-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathBacGenomePipeline.py
338 lines (253 loc) · 13.7 KB
/
BacGenomePipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
#!/usr/bin/env python
import subprocess
import os
import sys
import time
import argparse
import re
import shutil
# terminal output retro, slow function
def typewrite(message):
for char in message:
sys.stdout.write(char)
sys.stdout.flush()
if char != '\n':
time.sleep(0.05)
else:
time.sleep(0.5)
def _usage():
return ("BacGenomePipeline (--pipeline | --pipe_red_mem | --assembly | --annotation)\n"
"--fastq_file READS\n"
"\t --help --version\n"
"\t [--nanostats] [--medaka_polish]\n"
"\t [--mean_q_weight] [--asm_fasta]\n"
"\t [--genome_size SIZE] [--asm_coverage INT]\n"
"\t [--flye_dir DIR_NAME] [--polished_dir DIR_NAME]\n"
"\t [--amr_dir DIR_NAME] [--vir_dir DIR_NAME]\n")
def _epilog():
return ("\nDid you know? BacGenomePipeline can be run in modes\n\n"
"These modes include: pipeline, which runs the entire BacGenomePipeline workflow,\n"
"pipe_red_mem, which uses less memory by using use a subset of the longest reads \n"
"for initial disjointig by specifying --asm-coverage and --genome-size options. \n"
"The assembly mode runs assembly and polishing steps only. For this step, \n"
"annotation is excluded. Finally the annotation mode takes a genome assembly and \n"
"annotates it for antimicrobial and virulence genes \n"
"\nExample Usage:\n"
"BacGenomePipeline --pipeline -f reads.fastq\n"
"BacGenomePipeline --pipeline -f reads.fastq.gz -m -n\n"
"BacGenomePipeline --pipe_red_mem -f reads.fastq -g 5.7m -c 40 -n -m\n"
"BacGenomePipeline --annotation -s assembly.fasta\n"
"BacGenomePipeline --assembly -f reads.fastq -n -m\n")
def fastq_file_validation(input_file):
m = re.search('(.fastq$|fastq.gz$)', input_file)
if not m:
raise argparse.ArgumentTypeError('A valid fastq file must be submitted')
return input_file
def fasta_file_validation(input_file):
m = re.search('(.fasta$)', input_file)
if not m:
raise argparse.ArgumentTypeError('A valid fasta file must be submitted')
return input_file
def get_args():
parser = argparse.ArgumentParser(description='Complete Bacterial Genome Assembly and Annotation Pipeline',
epilog=_epilog(),
formatter_class=argparse.RawDescriptionHelpFormatter,
usage=_usage())
version = parser.add_argument_group(title='Version')
version.add_argument('--version', action="version",
help='Print version and exit.',
version='BacGenomePipeline {}'.format('1.0.0'))
pipeline_mode = parser.add_mutually_exclusive_group(required=True)
pipeline_mode.add_argument('--pipeline',
help='Runs the entire Pipeline',
default=None,
action='store_true',
)
pipeline_mode.add_argument('--pipe_red_mem',
help='Runs the entire Pipeline with reduced memory consumption',
default=None,
action='store_true',
)
pipeline_mode.add_argument('--assembly',
help='Runs the assembly portion of the pipeline',
default=None,
action='store_true',
)
pipeline_mode.add_argument('--annotation',
help='Runs the annotation portion of the pipeline',
default=None,
action='store_true',
)
read_input = parser.add_argument_group(title='Input fastq Reads (a fastq file is required)')
read_input.add_argument('-f', '--fastq_file',
help='Specify an input Fastq file for the Pipeline and assembly modes',
type=fastq_file_validation,
default=None,
metavar='')
pipeline_options = parser.add_argument_group(title='Pipeline Options')
pipeline_options.add_argument('-n', '--nanostats',
help='Optionally run NanoStats on your filtered read set',
action='store_true')
pipeline_options.add_argument('-m', '--medaka_polish',
help='Optionally run NanoStats on your filtered read set',
action='store_true')
pipeline_options.add_argument('-s', '--asm_fasta',
help='Add Genome assembly in fasta format',
default=None,
type=fasta_file_validation)
pipeline_options.add_argument('-w', '--q_weight',
help='Add mean_q_weight for read filtering',
default=7,
type=int)
flye_asm_options = parser.add_argument_group(title='Optional flye assembly arguments\n'
'(To reduce memory consumption for large genome assemblies)')
flye_asm_options.add_argument('-g', '--genome_size',
help='Estimated genome size (for example, 5m or 2.6g)',
required=False,
default=None,
metavar='')
flye_asm_options.add_argument('-c', '--asm_coverage',
default=None,
help="reduced coverage for initial "
"disjointig assembly [not set]",
type=int,
metavar='')
directory_names = parser.add_argument_group(title='Directory names')
directory_names.add_argument('-d', '--flye_dir',
help='Specify a flye genome assembly directory name',
default='flye_dir',
metavar='')
directory_names.add_argument('-p', '--polished_dir',
help='Specify a medaka polished genome directory name',
default='pol_dir',
metavar='')
directory_names.add_argument('-a', '--amr_dir',
help='Specify a antimicrobial resistance directory name',
default='amr_dir',
metavar='')
directory_names.add_argument('-v', '--vir_dir',
help='Specify a virulence gene directory name',
default='virulence_dir',
metavar='')
args = parser.parse_args()
if args.pipe_red_mem:
if (args.asm_coverage is None) or (args.genome_size is None):
parser.error("Assembly Pipeline mode: Pipe_red_mem, requires arguements supplied "
"for asm_coverage and genome_size")
if args.pipeline and (args.fastq_file is None):
parser.error("Assembly Pipeline mode: Pipeline, requires a fastq file as input")
if args.pipe_red_mem and (args.fastq_file is None):
parser.error("Assembly Pipeline mode: pipe_red_mem, requires a fastq file as input")
if args.assembly and (args.fastq_file is None):
parser.error("Assembly Pipeline mode: Assembly, requires a fastq file as input")
if args.annotation and (args.asm_fasta is None):
parser.error("Assembly Pipeline mode: Annotation, requires a fasta file as input")
return args
####linking logic
def main():
args = get_args()
if args.annotation == None:
filtered_reads = args.fastq_file.split('.')[0] + '_filtered.fastq'
raw_assembly = args.fastq_file.split('.')[0] + '_assembly.fasta'
consensus_asm = str(args.fastq_file.split('.')[0] + '_consensus.fasta')
weight = str(args.q_weight)
def filtlong_filtering(raw_read_set):
typewrite('Stage 1: Filtering Reads\n\n')
filtlong_cmds = 'filtlong --min_length 1000 --mean_q_weight ' + weight + \
' --target_bases 500000000 ' + raw_read_set + ' > ' + filtered_reads
os.system(filtlong_cmds)
def nanostats_report(reads):
typewrite('Stage 1a: Producing a Nanostats report of the filtered reads\n\n')
nanostats_cmd = ['NanoStat', '-o', 'NanoStats_Report', '-n', 'nano_report.txt', '--fastq', reads]
FNULL = open(os.devnull, 'w')
subprocess.call(nanostats_cmd, stdout=FNULL, stderr=subprocess.STDOUT)
def flye_genome_assembly(reads):
typewrite('Stage 2: Genome Assembly: \nThis step may take up to 30 minutes, go for a coffee\n\n')
flye_genome_assembly_cmds = ['flye', '-o', str(args.flye_dir), '--plasmids',
'--meta', '--threads', '16', '--nano-raw', reads]
FNULL = open(os.devnull, 'w')
subprocess.call(flye_genome_assembly_cmds, stdout=FNULL, stderr=subprocess.STDOUT)
shutil.move(str(args.flye_dir) + '/assembly.fasta', '.')
os.rename('assembly.fasta', raw_assembly)
def flye_genasm_reduced_ram(reads):
"""flye assembly method adjusted for reduced memory consumption"""
typewrite('Stage 2: Genome Assembly: \nThis step may take up to 30 minutes, go for a coffee\n\n')
typewrite('Reduced memory genome assembly\nlongest reads used for initial disjointig\n\n')
flye_genome_assembly_cmds = ['flye', '-o', str(args.flye_dir), '--plasmids',
'--threads', '10','--genome-size', str(args.genome_size), '--asm-coverage',
str(args.asm_coverage), '--nano-raw', reads]
FNULL = open(os.devnull, 'w')
subprocess.call(flye_genome_assembly_cmds, stdout=FNULL, stderr=subprocess.STDOUT)
shutil.move(str(args.flye_dir) + '/assembly.fasta', '.')
os.rename('assembly.fasta', raw_assembly)
def medaka_post_asm_pol(assembly):
typewrite('Stage 3: Post Assembly Polishing\n\n')
medaka_polishing = ['medaka_consensus', '-i', str(filtered_reads), '-d', assembly,
'-o', str(args.polished_dir), '-t', '10', '-m', 'r941_min_high_g303']
FNULL = open(os.devnull, 'w')
subprocess.call(medaka_polishing, stdout=FNULL, stderr=subprocess.STDOUT)
shutil.move(str(args.polished_dir) + '/consensus.fasta', '.')
os.rename('consensus.fasta', consensus_asm)
def antimicrobial_resistance_gene_detection(consensus_asm):
typewrite('Stage 4a: Antimicrobial Resistance Gene detection\n\n')
amr_input = ['staramr', 'search', '-o', str(args.amr_dir), consensus_asm]
FNULL = open(os.devnull, 'w')
subprocess.call(amr_input, stdout=FNULL, stderr=subprocess.STDOUT)
def virulence_gene_detection(consensus_asm):
typewrite('Stage 4b: Virulence Gene detection\n\n')
os.mkdir(str(args.vir_dir))
vir_cmds = 'abricate --quiet -db vfdb ' + str(consensus_asm) + ' > virulence_metadata.csv'
os.system(vir_cmds)
shutil.move('virulence_metadata.csv', str(args.vir_dir))
if args.annotation == None:
typewrite('Complete Bacterial Genome pipeline Initialized...\n\n')
# Pipeline workflow logic
def pipeline():
filtlong_filtering(raw_read_set=args.fastq_file)
if args.nanostats:
nanostats_report(reads=filtered_reads)
flye_genome_assembly(reads=filtered_reads)
if args.medaka_polish:
medaka_post_asm_pol(assembly=raw_assembly)
antimicrobial_resistance_gene_detection(consensus_asm=consensus_asm)
virulence_gene_detection(consensus_asm=consensus_asm)
else:
antimicrobial_resistance_gene_detection(consensus_asm=raw_assembly)
virulence_gene_detection(consensus_asm=raw_assembly)
typewrite('\u001b[32mSuccess \u2713\n')
def pipeline_reduced_mem_consumption():
filtlong_filtering(raw_read_set=args.fastq_file)
if args.nanostats:
nanostats_report(reads=filtered_reads)
flye_genasm_reduced_ram(reads=filtered_reads)
if args.medaka_polish:
medaka_post_asm_pol(assembly=raw_assembly)
antimicrobial_resistance_gene_detection(consensus_asm=consensus_asm)
virulence_gene_detection(consensus_asm=consensus_asm)
else:
antimicrobial_resistance_gene_detection(consensus_asm=raw_assembly)
virulence_gene_detection(consensus_asm=raw_assembly)
typewrite('\u001b[32mSuccess \u2713\n')
def assembly():
typewrite('Genome Assembly only portion of pipeline initiated...')
filtlong_filtering(raw_read_set=args.fastq_file)
if args.nanostats:
nanostats_report(reads=filtered_reads)
flye_genome_assembly(reads=filtered_reads)
if args.medaka_polish:
medaka_post_asm_pol(assembly=raw_assembly)
def annotation():
typewrite('Annotation only portion of pipeline initiated...\n\n')
antimicrobial_resistance_gene_detection(consensus_asm=args.asm_fasta)
virulence_gene_detection(consensus_asm=args.asm_fasta)
typewrite('\u001b[32mSuccess \u2713\n')
if args.pipeline:
pipeline()
if args.pipe_red_mem:
pipeline_reduced_mem_consumption()
if args.assembly:
assembly()
if args.annotation:
annotation()
if __name__ == '__main__':
main()