-
Notifications
You must be signed in to change notification settings - Fork 2
/
probetools_v_0_1_11.py
774 lines (711 loc) · 38.9 KB
/
probetools_v_0_1_11.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
#!/usr/bin/env python3
import sys
import subprocess
import os
def main():
version = '0.1.11'
# Parse command line arguments
module, args = parse_args(sys.argv, version)
# Set path to output directory and name to append to output files
out_path, name = os.path.split(args['-o'])
out_path = '.' if out_path == '' else out_path
# Make sure output directory exists
if os.path.exists(out_path) == False and os.path.isdir(out_path) == False:
print(f'\nERROR: Output path {out_path} does not exist.\n')
exit(1)
# Run top-level function for selected module
if module == 'clusterkmers':
print(f'\nProbeTools ClusterKmers v{version}')
print('https://github.com/KevinKuchinski/ProbeTools\n')
cluster_kmers(out_path, name, args['-t'], args['-k'], args['-i'], args['-s'], args['-d'], args['-p'], args['-n'], args['-T'])
elif module == 'capture':
print(f'\nProbeTools Capture v{version}')
print('https://github.com/KevinKuchinski/ProbeTools\n')
capture(out_path, name, args['-t'], args['-p'], args['-i'], args['-l'], args['-T'])
elif module == 'getlowcov':
print(f'\nProbeTools GetLowCov v{version}')
print('https://github.com/KevinKuchinski/ProbeTools\n')
get_low_cov(out_path, name, args['-i'], args['-k'], args['-D'], args['-L'])
elif module == 'stats':
print(f'\nProbeTools Stats v{version}')
print('https://github.com/KevinKuchinski/ProbeTools\n')
stats(out_path, name, args['-i'])
elif module == 'makeprobes':
print(f'\nProbeTools MakeProbes v{version}')
print('https://github.com/KevinKuchinski/ProbeTools\n')
make_probes(out_path, name, args['-t'], args['-b'], args['-m'], args['-c'], args['-k'], args['-i'], args['-s'], args['-d'],
args['-D'], args['-L'], args['-i'], args['-l'], args['-T'])
elif module == 'merge':
print(f'\nProbeTools Merge v{version}')
print('https://github.com/KevinKuchinski/ProbeTools\n')
merge(args['-i'], args['-I'], args['-o'])
print('\nDone.\n')
exit(0)
########## Command line interface functions ##########
def parse_args(args, version):
if len(args) < 2:
print('\nERROR: A ProbeTools module must be selected.\n')
print_usage(None, version)
exit(1)
else:
module = args[1]
arg_values = {}
for arg_1, arg_2 in zip(args[1:-1], args[2:]):
if arg_1[0] == '-':
if arg_2[0] != '-':
arg_values[arg_1] = arg_2
else:
arg_values[arg_1] = ''
if args[-1][0] == '-':
arg_values[args[-1]] = ''
# Set defaults, mins, and maxs depending on selected module
if module == 'clusterkmers':
required_args = {'-t', '-o'}
arg_value_types = {'-t': str, '-o': str, '-k': int, '-s': int, '-d': int, '-i': float, '-p': str, '-n': int, '-T': int}
min_arg_values = {'-k': 32, '-s': 1, '-d': 0, '-i': 50, '-n': 1, '-T': 1}
max_arg_values = {'-i': 100}
default_arg_values = {'-k': 120, '-s': 1, '-d': 0, '-i': 90, '-p': None, '-n': 'MAX', '-T': 0}
elif module == 'capture':
required_args = {'-t', '-p', '-o'}
arg_value_types = {'-t': str, '-p': str, '-o':str, '-i': float, '-l': int, '-T': int}
min_arg_values = {'-i': 50, '-l': 1, '-T': 1}
max_arg_values = {'-i': 100}
default_arg_values = {'-i': 90, '-l': 60, '-T': 1}
elif module == 'getlowcov':
required_args = {'-i', '-o'}
arg_value_types = {'-i': str, '-o': str, '-k': int, '-D': int, '-L': int}
min_arg_values = {'-k': 32, '-D': 0, '-L': 1}
max_arg_values = {}
default_arg_values = {'-k': 120, '-D': 0, '-L': 40}
elif module == 'stats':
required_args = {'-i', '-o'}
arg_value_types = {'-i': str, '-o': str}
min_arg_values = {}
max_arg_values = {}
default_arg_values = {}
elif module == 'makeprobes':
required_args = {'-t', '-b', '-o'}
arg_value_types = {'-t': str, '-b': int, '-o': str, '-m': int, '-c': float, '-k': int,
'-s': int, '-d': int, '-D': int, '-L': int, '-i': float, '-l': int, '-T': int}
min_arg_values = {'-m': 1, '-c': 0, '-k': 32, '-s': 1, '-d': 0, '-D': 0, '-L': 1, '-i': 50, '-l': 1, '-T': 1}
max_arg_values = {'-c': 100, '-i': 100}
default_arg_values = {'-m': 'MAX', '-c': 90, '-k': 120, '-s': 1, '-d': 0, '-D': 0, '-L': 40, '-i': 90, '-l': 60, '-T': 0}
elif module == 'merge':
required_args = {'-i', '-I', '-o'}
arg_value_types = {'-i': str, '-I': str, '-o': str}
min_arg_values = {}
max_arg_values = {}
default_arg_values = {}
else:
print('\nERROR: Module not recognized.')
print_usage(None, version)
exit(1)
# Check if all required arguments were provided
missing_args = set()
for required_arg in required_args:
if required_arg not in arg_values.keys() or arg_values[required_arg] == '':
missing_args = missing_args | {required_arg}
if missing_args != set():
print(f'\nERROR: Values must be provided for the argument following arguments: {", ".join(sorted(missing_args))}')
print_usage(module, version)
exit(1)
# Check if unrecognized arguments were provided
recognized_args = required_args | set(arg_value_types.keys())
unrecognized_args = set()
for provided_arg in arg_values.keys():
if provided_arg not in recognized_args:
unrecognized_args = unrecognized_args | {provided_arg}
if unrecognized_args != set():
print(f'\nERROR: The following arguments are not recognized: {", ".join(sorted(unrecognized_args))}')
print_usage(module, version)
exit(1)
# Check if arguments were provided without values
empty_args = set()
for arg, value in arg_values.items():
if value == '':
empty_args = empty_args | {arg}
if empty_args != set():
print(f'\nERROR: The following arguments were provided without values: {", ".join(sorted(empty_args))}')
print_usage(module, version)
exit(1)
# Check if provided values are of the correct type
for arg, value in arg_values.items():
try:
arg_values[arg] = arg_value_types[arg](value)
except ValueError:
print(f'\nERROR: Value for argument {arg} must be of type {str(arg_value_types[arg].__name__)}')
print_usage(module, version)
exit(1)
# Check if provided values are within the correct range
for arg, value in arg_values.items():
if arg in min_arg_values.keys() and value < min_arg_values[arg]:
print(f'\nERROR: Value for argument {arg} must be at least {min_arg_values[arg]}')
print_usage(module, version)
exit(1)
if arg in max_arg_values.keys() and value > max_arg_values[arg]:
print(f'\nERROR: Value for argument {arg} must not exceed {max_arg_values[arg]}')
print_usage(module, version)
exit(1)
# Assign default values to unspecified arguments
for arg, value in default_arg_values.items():
if arg not in arg_values.keys():
arg_values[arg] = value
# Return keyword args and their values
return module, arg_values
def print_usage(module, version):
if module == None:
print(f'\nProbeTools v{version}')
print('https://github.com/KevinKuchinski/ProbeTools\n')
print('Available modules:')
print('makeprobes - probe panel design using a general purpose incremental strategy')
print('clusterkmers - enumerate and cluster kmers from target sequences')
print('capture - assess probe panel coverage of target sequences')
print('getlowcov - extract low coverage sequences from target space')
print('stats - calculate probe coverage and depth stats')
print('merge - merge two sets of capture results\n')
elif module == 'clusterkmers':
print(f'\nProbeTools ClusterKmers v{version}')
print('https://github.com/KevinKuchinski/ProbeTools\n')
print('Usage: probetools clusterkmers -t <target seqs> -o <path to output directory>/<output name> [<optional args>]\n')
print('Required arguments:')
print(' -t : path to target seqs in FASTA file')
print(' -o : path to output directory and name to append to output files')
print('Optional arguments:')
print(' -k : length of kmers to enumerate (default=120, min=32)')
print(' -s : number of bases separating each kmer (default=1, min=1)')
print(' -d : number of degenerate bases to permit in probes (default=0, min=0)')
print(' -i : nucleotide sequence identity (%) threshold used for kmer clustering (default=90, min=50, max=100)')
print(' -p : path to FASTA file containing previously-generated probe sequences to filter from new probes')
print(' -n : number of probe candidates to return (default=MAX, min=1)')
print(' -T : number of threads used by VSEARCH for clustering kmers (default=MAX, min=1)\n')
elif module == 'capture':
print(f'\nProbeTools Capture v{version}')
print('https://github.com/KevinKuchinski/ProbeTools\n')
print('Usage: probetools capture -t <target seqs> -p <probe seqs> -o <path to output directory>/<output name> [<optional args>]\n')
print('Required arguments:')
print(' -t : path to target seqs in FASTA file')
print(' -p : path to probe sequences in FASTA file')
print(' -o : path to output directory and name to append to output files')
print('Optional arguments:')
print(' -i : nucleotide sequence identity (%) threshold used for probe-target alignments (default=90, min=50, max=100)')
print(' -l : minimum length for probe-target alignments (default=60, min=1)')
print(' -T : number of threads used by BLASTn for aligning probes to targets (default=1, min=1)\n')
elif module == 'getlowcov':
print(f'\nProbeTools GetLowCov v{version}')
print('https://github.com/KevinKuchinski/ProbeTools\n')
print('Usage: probetools getlowcov -i <input file> -o <path to output directory>/<output name> [<optional args>]')
print('Required arguments:')
print(' -i : path to capture results in PT file')
print(' -o : path to output directory and name to append to output files')
print('Optional arguments:')
print(' -k : minimum sub-sequence length extracted, should be same as kmer length used for making probes (default=120, min=32)')
print(' -D : minimum probe depth threshold used to define low coverage sub-sequences (default=0, min=0)')
print(' -L : minimum number of consecutive bases below probe depth threshold to define a low coverage sub-sequence (default=40, min=1)')
elif module == 'stats':
print(f'\nProbeTools Stats v{version}')
print('https://github.com/KevinKuchinski/ProbeTools\n')
print('Usage: probetools stats -i <input file> -o <path to output directory>/<output name>')
print('Required arguments:')
print(' -i : path to capture results in PT file')
print(' -o : path to output directory and name to append to output files\n')
elif module == 'makeprobes':
print(f'\nProbeTools MakeProbes v{version}')
print('https://github.com/KevinKuchinski/ProbeTools\n')
print('Usage: probetools incrementalprobes -t <target seqs> -b <batch size> -o <path to output directory>/<output name> [<optional args>]')
print('Required arguments:')
print(' -t : path to target sequences in FASTA file')
print(' -b : number of probes in each batch (min=1)')
print(' -o : path to output directory and name to append to output files')
print('Optional arguments:')
print(' -m : max number of probes to add to panel (default=MAX, min=1)')
print(' -c : target for 10th percentile of probe coverage (default=90, min=1, max=100)')
print(' -k : length of probes to generate (default=120, min=32)')
print(' -s : number of bases separating each kmer (default=1, min=1)')
print(' -d : number of degenerate bases to permit in probes (default=0, min=0)')
print(' -i : nucleotide sequence identity (%) threshold used for kmer clustering and probe-target alignments (default=90, min=50, min=100)')
print(' -l : minimum length for probe-target alignments (default=60, min=1)')
print(' -D : minimum probe depth threshold used to define low coverage sub-sequences (default=0, min=0)')
print(' -L : minimum number of consecutive bases below probe depth threshold to define a low coverage sub-sequence (default=40, min=1)')
print(' -T : number of threads used by VSEARCH and BLASTn for clustering kmers and aligning'
f' probes to targets (default=MAX for VSEARCH, default=1 for BLASTn, min=1)\n')
elif module == 'merge':
print(f'\nProbeTools Merge v{version}')
print('https://github.com/KevinKuchinski/ProbeTools\n')
print('Usage: probetools merge -i <input file 1> -I <input file 2> -o <path to output file>')
print('Required arguments:')
print(' -i : path to capture results in PT file')
print(' -I : path to other capture results in PT file')
print(' -o : path to merged capture results PT output file\n')
########## Helper functions ##########
def check_input(files):
'''Checks list of input file paths to make sure they exist.'''
for file in files:
if os.path.exists(file) == False:
print(f'\nERROR: Input file {file} does not exist.\n')
exit(1)
def load_fasta(fasta_path):
'''Loads contents of a FASTA file into two lists (one for headers, and another for their
corresponding seqs).'''
# Check if targets FASTA is valid
check_input([fasta_path])
# Open targets FASTA and load headers and seqs into lists
with open(fasta_path, 'r') as input_file:
entry_counter = 0
header = ''
headers, seqs = [], []
for line in input_file:
if line[0] == '>':
entry_counter += 1
header = line.strip().lstrip('>')
headers.append(header)
seqs.append('')
elif header != '':
seqs[-1] += line.strip()
for header, seq in zip(headers, seqs):
if header == '':
print('\nWARNING: Seq is present without header in {fasta_path}.')
if seq == '':
print('\nWARNING: Header {header} in {fasta_path} has no accompanying seq.')
return headers, seqs
def append_fasta(fasta_path, new_fasta_path):
'''Appends contents of new FASTA file to existing FASTA file then deletes new FASTA file.'''
with open(new_fasta_path, 'r') as input_file, open(fasta_path, 'a') as output_file:
for line in input_file:
output_file.write(line)
# GARBAGE COLLECTION - Delete new FASTA file after contents have been appended
os.remove(new_fasta_path)
######### Top-level functions for modules ##########
def cluster_kmers(out_path, name, targets_path, k, cluster_id, step, max_degen, prev_probes_path, num_probes, threads):
check_input([targets_path])
kmers_path = os.path.join(out_path, name + '_kmers.fa')
enum_kmers(targets_path, kmers_path, k, step, max_degen)
centroids_path = os.path.join(out_path, name + '_centroids.fa')
cluster_kmers_with_VSEARCH(kmers_path, centroids_path, cluster_id, threads)
remove_prev_probes(centroids_path, prev_probes_path)
probe_names, probe_seqs = rank_probe_candidates(centroids_path)
probes_path = os.path.join(out_path, name + '_probes.fa')
potential_probes, probes_writen = write_top_probes(probe_names, probe_seqs, num_probes, probes_path, name)
return potential_probes, probes_writen
def capture(out_path, name, targets_path, probes_path, min_id, min_length, threads):
check_input([targets_path, probes_path])
blast_path = os.path.join(out_path, name + '_blast_results.tsv')
align_probes_to_targets_with_BLAST(targets_path, probes_path, blast_path, threads)
capture_data = create_empty_capture_data(targets_path)
capture_data = add_BLAST_results_to_capture_data(blast_path, capture_data, min_id, min_length)
capture_path = os.path.join(out_path, name + '_capture.pt')
write_capture_data(capture_path, capture_data)
def get_low_cov(out_path, name, capture_path, k, min_depth, min_length):
check_input([capture_path])
capture_data = load_capture_data(capture_path)
low_cov_path = os.path.join(out_path, name + '_low_cov_seqs.fa')
seqs_writen = write_low_cov_seqs(capture_data, low_cov_path, k, min_depth, min_length)
def stats(out_path, name, capture_path):
'''Top-level function for stats module.'''
check_input([capture_path])
capture_data = load_capture_data(capture_path)
report_path = os.path.join(out_path, name + '_summary_stats_report.tsv')
write_summary_report(capture_data, report_path, name)
report_path = os.path.join(out_path, name + '_long_stats_report.tsv')
write_long_report(capture_data, report_path, name)
def make_probes(out_path, name, targets_path, batch_size, max_probes, cov_target, k, cluster_id, step, max_degen,
min_depth, min_low_cov_length, min_id, min_capture_length, threads):
check_input([targets_path])
# Set variable for counting rounds of incremental probe design
round_counter = 0
# Initialize variables for panel size (# probes) and 10th percentile of panel coverage
panel_size, panel_cov = 0, 0
# If panel size is not specified, set max_panel_size as 1 probe larger than current panel size so loop doesn't break
max_panel_size = panel_size + batch_size if max_probes == 'MAX' else max_probes
# Create empty FASTA file for final probe panel
final_probes_path = os.path.join(out_path, name + '_probes.fa')
with open(final_probes_path, 'w') as output_file:
pass
# Create empty capture dict from target space
capture_data = create_empty_capture_data(targets_path)
print()
# Enter main incremental design loop
while panel_size < max_panel_size and panel_cov < cov_target:
round_counter += 1
print('*' * 20, f'ROUND {round_counter}', '*' * 20 + '\n')
# Write low cov seqs to FASTA as target space for this round
low_cov_path = os.path.join(out_path, name + '_low_cov_seqs.fa')
seqs_writen = write_low_cov_seqs(capture_data, low_cov_path, k, min_depth, min_low_cov_length)
print()
# Break design loop if no low seqs were writen
if seqs_writen == 0:
break
# Make probes from target space
num_probes = min(batch_size, max_panel_size - panel_size)
round_name = name + f'_round_{round_counter}'
potential_probes, probes_writen = cluster_kmers(out_path, round_name, low_cov_path, k, cluster_id, step, max_degen, final_probes_path, num_probes, threads)
print()
# GARBAGE COLLECTION - Delete low cov seqs
os.remove(low_cov_path)
# Update panel size and max panel size
panel_size += probes_writen
max_panel_size = panel_size + batch_size if max_probes == 'MAX' else max_probes
# Break design loop if no potential probes or no probes writen
if potential_probes == 0 or probes_writen == 0:
break
# Capture probes against original targets
blast_path = os.path.join(out_path, name + '_blast_results.tsv')
new_probes_path = os.path.join(out_path, round_name + '_probes.fa')
align_probes_to_targets_with_BLAST(targets_path, new_probes_path, blast_path, threads)
capture_data = add_BLAST_results_to_capture_data(blast_path, capture_data, min_id, min_capture_length)
print()
# Append new probes to final panel
append_fasta(final_probes_path, new_probes_path)
# Check if panel cov exceeds cov target
panel_cov = calc_cov_percentiles(capture_data, percentiles=(0.1,))[0]
print(f'10th percentile of target coverage: {panel_cov}%\n')
print('Incremental probe design finished.')
if panel_size >= max_panel_size:
print(' Maximum panel size reached.')
if panel_cov >= cov_target:
print(' Coverage target for 10th percentile of targets reached.')
if seqs_writen == 0 or potential_probes == 0 or probes_writen == 0:
print(' No additional probes could be designed.')
capture_path = os.path.join(out_path, name + '_capture.pt')
print(f'\nWriting capture results to {capture_path}...')
write_capture_data(capture_path, capture_data)
report_path = os.path.join(out_path, name + '_summary_stats_report.tsv')
write_summary_report(capture_data, report_path, name)
report_path = os.path.join(out_path, name + '_long_stats_report.tsv')
write_long_report(capture_data, report_path, name)
def merge(capture_path, other_capture_path, merged_capture_path):
capture_data = load_capture_data(capture_path)
other_capture_data = load_capture_data(other_capture_path)
merged_capture_data = merge_capture_results(capture_data, other_capture_data)
write_capture_data(merged_capture_path, merged_capture_data)
########## clusterkmers functions ##########
def enum_kmers(targets_path, kmers_path, k, step, max_degen):
print(f'Enumerating all {k}-mers in {targets_path}...')
# Load contents of targets FASTA
headers, seqs = load_fasta(targets_path)
print(f' Loaded {"{:,}".format(len(seqs))} target seqs in {targets_path}...')
# Create FASTA file for kmers output and enumerate kmers from target seqs
with open(kmers_path, 'w') as output_file:
target_counter = 0
kmer_counter = 0
for header, seq in zip(headers, seqs):
if seq == '':
print(f' WARNING --- Target {header} has no sequence!')
elif len(seq) < k:
print(f' WARNING --- Target {header} is shorter than the desired probe length!')
elif len(seq) >= k and seq != '':
target_counter += 1
for i in range(0, len(seq) - k + 1, step):
kmer = seq[i:i+k].upper()
num_degen = len(kmer) - sum(kmer.count(base) for base in 'ATGC')
if num_degen <= max_degen:
kmer_counter += 1
output_file.write(f'>kmer_{kmer_counter}\n')
output_file.write(kmer + '\n')
print(f' Enumerated {"{:,}".format(kmer_counter)} k-mers from {targets_path}.')
def cluster_kmers_with_VSEARCH(kmers_path, centroids_path, cluster_id, threads):
print(f' Clustering k-mers at {cluster_id}% identity...')
# Create and run terminal command for VSEARCH
terminal_command = (f'vsearch --cluster_fast {kmers_path} --id {cluster_id / 100} --centroids {centroids_path}'
f' --fasta_width 0 --sizeout --qmask none --threads {threads}')
finished_process = subprocess.call(terminal_command, stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL, shell=True)
if finished_process != 0:
print(f'\nERROR: vsearch cluster_fast terminated with errors while clustering k-mers'
f' (Error code: {finished_process}).\n')
exit(1)
# GARBAGE COLLECTION - Delete kmers FASTA file
os.remove(kmers_path)
print(f' K-mer clustering finished.')
def remove_prev_probes(centroids_path, prev_probes_path):
if prev_probes_path != None:
print(f'Removing any previously designed probes...')
prev_probe_headers, prev_probe_seqs = load_fasta(prev_probes_path)
print(f' Loaded {"{:,}".format(len(prev_probe_seqs))} previously designed probes in {prev_probes_path}')
centroid_headers, centroid_seqs = load_fasta(centroids_path)
with open(centroids_path, 'w') as output_file:
for header, seq in zip(centroid_headers, centroid_seqs):
if seq not in prev_probe_seqs:
output_file.write('>' + header + '\n')
output_file.write(seq + '\n')
def rank_probe_candidates(centroids_path):
print(f'Ranking probe candidates...')
# Load centroids FASTA file
headers, seqs = load_fasta(centroids_path)
print(f' Loaded {"{:,}".format(len(seqs))} probe candidates.')
# Create dicts for looking up each candidate probe seq's cluster size and probe name
cluster_sizes = {seq: int(header.split('size=')[1].split(';')[0]) for header, seq in zip(headers, seqs)}
probe_names = {seq: header.split(';')[0] for header, seq in zip(headers, seqs)}
# Rank candidate probe seqs based on cluster size
probe_seqs = sorted(seqs, key=lambda seq: cluster_sizes[seq], reverse=True)
# GARBAGE COLLCTION - Delete centroids FASTA file
os.remove(centroids_path)
return probe_names, probe_seqs
def write_top_probes(probe_names, probe_seqs, num_probes, probes_path, name):
print(f'Writing top probe candidates to {probes_path}...')
potential_probes = len(probe_seqs)
# Check if there are any probes to write
if potential_probes == 0:
print(' No probe candidates to write to FASTA.')
return 0, 0
# Determine how many probes to write to FASTA file
if num_probes == 'MAX':
max_probes = len(probe_seqs)
else:
max_probes = num_probes
# Write probes to FASTA file
probe_counter = 0
with open(probes_path, 'w') as output_file:
while probe_counter < len(probe_seqs) and probe_counter < max_probes:
probe_seq = probe_seqs[probe_counter]
probe_counter += 1
output_file.write(f'>{name}_probe_{probe_counter}\n')
output_file.write(probe_seq + '\n')
print(f' Wrote {"{:,}".format(probe_counter)} probes to {probes_path}.')
return potential_probes, probe_counter
########## capture functions ##########
def align_probes_to_targets_with_BLAST(targets_path, probes_path, blast_path, threads):
print(f'Aligning probes in {probes_path} to targets in {targets_path}...')
# Check that BLAST db files exist for target seqs
if any([os.path.exists(targets_path + '.' + suffix) == False for suffix in ['nhr', 'nin' , 'nsq']]):
print(' WARNING: blastn db files do not exist for target seqs. Creating blastn db...')
# Create terminal command for making BLASTn db
terminal_command = f'makeblastdb -in {targets_path} -dbtype nucl'
# Run terminal command and redirect stdout and stderr to DEVNULL
finished_process = subprocess.run(terminal_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, shell=True)
# Exit with errors if BLASTn terminates with errors
if finished_process.returncode != 0:
print(f'\nERROR: makeblastdb terminated with errors while making db for target seqs (Error code: {finished_process.returncode}).\n')
exit(1)
# Count target seqs
headers, seqs = load_fasta(targets_path)
num_targets = len(seqs)
# Create and run terminal command for BLASTn
threads = 1 if threads == 0 else threads
terminal_command = (f"blastn -db {targets_path} -query {probes_path} -max_target_seqs {num_targets}"
f" -num_threads {threads} -outfmt 6 > {blast_path}")
finished_process = subprocess.run(terminal_command, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, shell=True)
if finished_process.returncode != 0:
print(f'\nERROR: blastn terminated with errors while aligning probes to target seqs (Error code: {finished_process.returncode}).\n')
exit(1)
def create_empty_capture_data(targets_path):
print(f'Extracting target names and seqs from {targets_path}...')
with open(targets_path, 'r') as input_file:
headers, seqs = [], []
for line in input_file:
if line[0] == '>':
header = line.strip().lstrip('>')
headers.append(header)
seqs.append('')
else:
seqs[-1] += line.strip()
for header in headers:
if headers.count(header) > 1:
print(f'\nERROR: Header {header} appears more than once in {targets_path}.\n')
exit(1)
if ' ' in header:
print(f'\nERROR: Header {header} contains spaces.\n')
exit(1)
capture_data = {header: (seq, [0] * len(seq)) for header, seq in zip(headers, seqs)}
print(f' Total targets loaded: {"{:,}".format(len(capture_data))}')
return capture_data
def load_capture_data(capture_path):
print(f'Loading capture data from {capture_path}...')
with open(capture_path, 'r') as input_file:
headers, seqs, depths = [], [], []
for line in input_file:
if line[0] == '>':
header = line.strip().lstrip('>')
headers.append(header)
seqs.append('')
depths.append('')
elif line[0] == '$':
seqs[-1] += line.strip()
elif line[0] == '#':
depths[-1] += line.strip()
seqs = [seq.lstrip('$') for seq in seqs]
depths = [[int(d) for d in depth.lstrip('#').split(',')] for depth in depths]
if len(set(len(c) for c in [headers, seqs, depths])) != 1:
print(f'\nERROR: The number of headers, seqs, and probe depth lists do not match in {capture_path}.')
exit(1)
for header in headers:
if headers.count(header) > 1:
print(f'\nERROR: Header {header} appears more than once in {capture_path}.\n')
exit(1)
capture_data = {header: (seq, depth) for header, seq, depth in zip(headers, seqs, depths)}
for header in capture_data.keys():
if len(capture_data[header][0]) != len(capture_data[header][1]):
print(f'\nERROR: Seq length and probe depth list length do not match for entry {header}.')
exit(1)
print(f' Total targets loaded: {"{:,}".format(len(capture_data))}')
return capture_data
def merge_capture_results(capture_data, other_capture_data):
headers, other_headers = set(capture_data.keys()), set(other_capture_data.keys())
shared_headers = headers & other_headers
merged_capture_data = {}
for header in shared_headers:
if capture_data[header][0] != other_capture_data[header][0]:
print(f'\nERROR: Header {header} appears in both capture results but they do not have the same seq.')
exit(1)
if len(capture_data[header][1]) != len(other_capture_data[header][1]):
print(f'\nERROR: Header {header} appears in both capture results but their probe depth lists are different lengths.')
exit(1)
merged_capture_data[header] = (capture_data[header][0], [a + b for a, b in zip(capture_data[header][1], other_capture_data[header][1])])
for header in headers - shared_headers:
merged_capture_data[header] = capture_data[header]
for header in other_headers - shared_headers:
merged_capture_data[header] = other_capture_data[header]
return merged_capture_data
def add_BLAST_results_to_capture_data(blast_path, capture_data, min_id, min_length):
print(f'Extracting probe coverage and probe depth from alignment of probes against targets...')
with open(blast_path, 'r') as input_file:
for line in input_file:
line = line.split('\t')
if float(line[2]) >= min_id and float(line[3]) >= min_length:
target = line[1].strip()
aln_start = int(line[8].strip()) - 1
aln_end = int(line[9].strip()) - 1
aln_start, aln_end = sorted([aln_start, aln_end])
for position in range(aln_start, aln_end + 1):
capture_data[target][1][position] += 1
# GARBAGE COLLECTION - Delete BLAST results
os.remove(blast_path)
return capture_data
def write_capture_data(capture_path, capture_data):
print(f'Writing capture results to {capture_path}...')
with open(capture_path, 'w') as output_file:
for header, (seq, depth) in capture_data.items():
output_file.write('>' + header + '\n')
output_file.write('$' + seq + '\n')
output_file.write('#' + ','.join([str(d) for d in depth]) + '\n')
print(f' Wrote capture results for {"{:,}".format(len(capture_data))} targets.')
########## getlowcov functions ##########
def write_low_cov_seqs(capture_data, low_cov_path, k, min_depth, min_length):
print(f'Extracting low coverage areas...')
# Write low cov seqs to FASTA file
with open(low_cov_path, 'w') as output_file:
total_low_cov_counter = 0
for header, (seq, depth) in capture_data.items():
if len(seq) < k:
print(f' WARNING: Seq {header} is shorter than the window length so low coverage seqs cannot be extracted.')
else:
low_cov_counter = 0
low_cov_start = 0
while low_cov_start < len(seq):
while low_cov_start < len(seq) and depth[low_cov_start] > min_depth:
low_cov_start += 1
low_cov_end = low_cov_start
while low_cov_end < len(seq) and depth[low_cov_end] <= min_depth:
low_cov_end += 1
low_cov_seq = seq[low_cov_start:low_cov_end]
if len(low_cov_seq) > min_length and len(low_cov_seq) >= k:
total_low_cov_counter += 1
low_cov_counter += 1
output_file.write(f'>{header}_position_{low_cov_start + 1}_to_{low_cov_end}\n')
output_file.write(low_cov_seq + '\n')
elif len(low_cov_seq) > min_length and len(low_cov_seq) < k:
window_start = low_cov_start - int((k - len(low_cov_seq)) / 2)
window_end = window_start + k
if window_start < 0:
window_end = window_end - window_start
window_start = 0
if window_end > len(seq):
window_start = window_start - window_end + len(seq)
window_end = len(seq)
extended_low_cov_seq = seq[window_start:window_end]
total_low_cov_counter += 1
low_cov_counter += 1
output_file.write(f'>{header}_position_{window_start + 1}_to_{window_end}\n')
output_file.write(extended_low_cov_seq.upper() + '\n')
low_cov_start = low_cov_end
print(f' Wrote {total_low_cov_counter} low coverage seqs from {"{:,}".format(len(capture_data))} targets.')
seqs_writen = total_low_cov_counter
return seqs_writen
########## stats functions ##########
def calc_cov_at_depth(seq, depth, min_depth):
covered_bases = 0
total_bases = 0
for base, base_depth in zip(seq, depth):
if base_depth >= min_depth:
covered_bases += 1
total_bases += 1
elif base.upper() in 'ATGC':
total_bases += 1
if total_bases == 0:
coverage = 0
else:
coverage = covered_bases * 100 / total_bases
return coverage
def calc_cov_percentiles(capture_data, percentiles=(0, 0.05, 0.1, 0.25, 0.5, 0.75, 1)):
"""Takes a capture dict and tuple of perentiles, and returns a list of those percentile
values."""
cov_values = []
for header, (seq, depth) in capture_data.items():
cov_values.append(calc_cov_at_depth(seq, depth, 1))
cov_values = sorted(cov_values)
percentile_values = []
for percentile in percentiles:
values_under_percentile = (len(cov_values) - 1) * percentile
if values_under_percentile == int(values_under_percentile):
percentile_values.append(cov_values[int(values_under_percentile)])
else:
percentile_value = cov_values[int(values_under_percentile)]
percentile_value += cov_values[int(values_under_percentile) + 1]
percentile_value = percentile_value / 2
percentile_values.append(percentile_value)
return [round(percentile_value,2) for percentile_value in percentile_values]
def calc_total_probe_depth(capture_data):
"""Takes a capture dict and returns a tuple containing the percentage of nucleotide positions
in the target space covered by 0, 1, 2, 3, 4, and 5+ probes."""
total = 0
total_0 = 0
total_1 = 0
total_2 = 0
total_3 = 0
total_4 = 0
total_5 = 0
for header, (seq, depth) in capture_data.items():
total += len(depth)
total_0 += depth.count(0)
total_1 += depth.count(1)
total_2 += depth.count(2)
total_3 += depth.count(3)
total_4 += depth.count(4)
total_5 += len([d for d in depth if d >= 5])
total_0 = round(total_0 * 100 / total, 2)
total_1 = round(total_1 * 100 / total, 2)
total_2 = round(total_2 * 100 / total, 2)
total_3 = round(total_3 * 100 / total, 2)
total_4 = round(total_4 * 100 / total, 2)
total_5 = round(total_5 * 100 / total, 2)
return (total_0, total_1, total_2, total_3, total_4, total_5)
def write_summary_report(capture_data, report_path, name):
"""Takes a capture dict and generates the summary report."""
print(f'Writing probe coverage and probe depth summary stats to {report_path}...')
with open(report_path, 'w') as output_file:
header = ['name', 'total_targets']
header += ['cov_' + s for s in ['min', '5%tile', '10%tile', 'Q1', 'med', 'Q3', 'max']]
header += ['depth_' + s for s in ['0', '1', '2', '3', '4', '5+']]
output_file.write('\t'.join(header) + '\n')
line = [name, str(len(capture_data))]
line += [str(f) for f in calc_cov_percentiles(capture_data)]
line += [str(f) for f in calc_total_probe_depth(capture_data)]
output_file.write('\t'.join(line) + '\n')
def write_long_report(capture_data, report_path, name):
"""Takes a capture dict and generates the long-form report."""
print(f'Writing probe coverage and probe depth stats for each target to {report_path}...')
with open(report_path, 'w') as output_file:
header = ['name', 'target', 'length', 'ATGCs', '%_ATGCs']
header += ['bases_' + str(d) + 'X' for d in ['0', '1', '2', '3', '4', '5+']]
header += ['%_cov_' + str(d) + 'X' for d in ['1', '2', '3', '4', '5']]
output_file.write('\t'.join(header) + '\n')
for header, (seq, depth) in capture_data.items():
total_ATGC = len([s for s in seq if s.upper() in 'ATGC'])
length = len(depth)
line = [name, header, str(length), str(total_ATGC), str(round(total_ATGC * 100 / length, 2))]
line += [str(depth.count(d)) for d in [0, 1, 2, 3, 4]]
line += [str(len([d for d in depth if d >= 5]))]
line += [str(round(calc_cov_at_depth(seq, depth, min_depth), 1)) for min_depth in [1, 2, 3, 4, 5]]
output_file.write('\t'.join(line) + '\n')
######### call main function ##########
if __name__ == '__main__':
main()