#!/usr/bin/env python3
"""
Kaptive
This is a tool which reports information about the K and O types for Klebsiella genome assemblies.
It will help a user to decide whether their Klebsiella sample has a known or novel locus type, and
if novel, how similar it is to a known type.
This script needs the following input files to run:
* A Genbank file with known locus types
* One or more assemblies in FASTA format
Example command:
kaptive.py -a path/to/assemblies/*.fasta -k k_loci_refs.gbk -o output_directory
For each input assembly file, Kaptive will identify the closest known locus type and report
information about the corresponding locus genes.
It generates the following output files:
* A FASTA file for each input assembly with the nucleotide sequences matching the closest locus type
* A table summarising the results for all input assemblies
Character codes indicate problems with the locus match:
* `?` indicates that the match was not in a single piece, possibly due to a poor match or a
  discontiguous assembly
* `-` indicates that genes expected in the locus were not found
* `+` indicates that extra genes were found in the locus
* `*` indicates that one or more expected genes were found but with low identity
https://github.com/katholt/kaptive
Author: Ryan Wick
email: rrwick@gmail.com
"""
from __future__ import print_function
from __future__ import division
import argparse
import sys
import os
import multiprocessing
import subprocess
import json
import fcntl
import gzip
import copy
import random
from collections import OrderedDict
from Bio import SeqIO
__version__ = '0.5'
def main():
"""Script execution starts here."""
args = get_argument_parser().parse_args()
check_for_blast()
check_files_exist(args.assembly + [args.k_refs] + [args.allelic_typing])
check_assembly_format(args.assembly)
fix_paths(args)
output_table = not args.no_table
output_json = not args.no_json
temp_dir = make_temp_dir(args)
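    # Parse the reference GenBank file into temporary FASTA files and build a lookup of all reference genes.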
k_ref_seqs, gene_seqs, k_ref_genes = parse_genbank(args.k_refs, temp_dir, args.locus_label)
all_gene_dict = {}
for gene_list in k_ref_genes.values():
for gene in gene_list:
all_gene_dict[gene.full_name] = gene
k_refs = load_k_locus_references(k_ref_seqs, k_ref_genes)
type_gene_names = get_type_gene_names(args.allelic_typing)
if output_table:
create_table_file(args.out, type_gene_names)
json_list = []
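    # Process each assembly in turn: find its best-matching locus, then characterise that match in detail.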
for fasta_file in args.assembly:
assembly = Assembly(fasta_file)
best_k = get_best_k_type_match(assembly, k_ref_seqs, k_refs, args.threads)
if best_k is None:
type_gene_results = {}
best_k = KLocus('None', '', [])
else:
find_assembly_pieces(assembly, best_k, args)
assembly_pieces_fasta = save_assembly_pieces_to_file(best_k, assembly, args.out)
type_gene_results = type_gene_search(assembly_pieces_fasta, type_gene_names, args)
if args.no_seq_out and assembly_pieces_fasta is not None:
os.remove(assembly_pieces_fasta)
protein_blast(assembly, best_k, gene_seqs, args)
check_name_for_o1_o2(best_k)
output(args.out, assembly, best_k, args, type_gene_names, type_gene_results,
json_list, output_table, output_json, all_gene_dict)
if output_json:
write_json_file(args.out, json_list)
clean_up(k_ref_seqs, gene_seqs, temp_dir)
sys.exit(0)
def get_argument_parser():
"""Specifies the command line arguments required by the script."""
parser = argparse.ArgumentParser(description='Kaptive',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
add_arguments_to_parser(parser)
return parser
def add_arguments_to_parser(parser):
parser.add_argument('--version', action='version', version='Kaptive ' + __version__,
help="Show Kaptive's version number and exit")
parser.add_argument('-a', '--assembly', nargs='+', type=str, required=True,
help='FASTA file(s) for assemblies')
parser.add_argument('-k', '--k_refs', type=str, required=True,
help='GenBank file with reference loci')
parser.add_argument('-g', '--allelic_typing', type=str, required=False,
help='SRST2-formatted FASTA file of allelic typing genes to include in '
'results')
parser.add_argument('-o', '--out', type=str, required=False, default='./kaptive_results',
help='Output directory/prefix')
parser.add_argument('-v', '--verbose', action='store_true',
help='Display detailed information about each assembly in stdout')
parser.add_argument('-t', '--threads', type=int, required=False,
default=min(multiprocessing.cpu_count(), 4),
help='The number of threads to use for the BLAST searches')
parser.add_argument('--no_seq_out', action='store_true',
help='Suppress output files of sequences matching locus')
parser.add_argument('--no_table', action='store_true',
help='Suppress output of tab-delimited table')
parser.add_argument('--no_json', action='store_true',
help='Suppress output of JSON file')
parser.add_argument('--start_end_margin', type=int, required=False, default=10,
                        help='Missing bases at the ends of the locus allowed in a perfect match.')
parser.add_argument('--min_gene_cov', type=float, required=False, default=90.0,
help='minimum required %% coverage for genes')
parser.add_argument('--min_gene_id', type=float, required=False, default=80.0,
help='minimum required %% identity for genes')
parser.add_argument('--low_gene_id', type=float, required=False, default=95.0,
help='genes with a %% identity below this value will be flagged as low '
'identity')
parser.add_argument('--min_assembly_piece', type=int, required=False, default=100,
                        help='minimum length of a locus-matching assembly piece to return')
parser.add_argument('--gap_fill_size', type=int, required=False, default=100,
help='when separate parts of the assembly are found within this distance, '
'they will be merged')
parser.add_argument('--locus_label', type=str, required=False,
default='automatically determined',
help='In the Genbank file, the source feature must have a note '
'identifying the locus name, starting with this label followed by '
'a colon (e.g. /note="K locus: K1")')
def check_for_blast():
"""Checks to make sure the required BLAST+ tools are available."""
if not find_program('makeblastdb'):
quit_with_error('could not find makeblastdb tool (part of BLAST+)')
if not find_program('blastn'):
quit_with_error('could not find blastn tool (part of BLAST+)')
if not find_program('tblastn'):
quit_with_error('could not find tblastn tool (part of BLAST+)')
def find_program(name):
"""Checks to see if a program exists."""
process = subprocess.Popen(['which', name], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = process.communicate()
return bool(out) and not bool(err)
def fix_paths(args):
"""
Changes the paths given by the user to absolute paths, which are easier to work with later.
Also creates the output directory, if necessary.
"""
args.assembly = [os.path.abspath(x) for x in args.assembly]
args.k_refs = os.path.abspath(args.k_refs)
if args.out[-1] == '/':
args.out += 'kaptive_results'
args.out = os.path.abspath(args.out)
out_dir = os.path.dirname(args.out)
if not os.path.exists(out_dir):
os.makedirs(out_dir)
def make_temp_dir(args):
"""Makes the temporary directory, if necessary. Returns the temp directory path."""
temp_dir_name = 'kaptive_temp_' + str(os.getpid()) + '_' + str(random.randint(0, 999999))
temp_dir = os.path.join(os.path.dirname(args.out), temp_dir_name)
if not os.path.exists(temp_dir):
os.makedirs(temp_dir)
return temp_dir
def clean_up(k_ref_seqs, gene_seqs, temp_dir):
"""
Deletes the temporary FASTA files. If the temp directory is then empty, it is deleted too.
"""
try:
os.remove(k_ref_seqs)
except OSError:
pass
try:
os.remove(gene_seqs)
except OSError:
pass
try:
if not os.listdir(temp_dir):
os.rmdir(temp_dir)
except FileNotFoundError:
pass
def parse_genbank(genbank, temp_dir, locus_label):
"""
This function reads the input Genbank file and produces two temporary FASTA files: one with the
loci nucleotide sequences and one with the gene sequences.
It returns the file paths for these two FASTA files along with a dictionary that links genes to
loci.
"""
k_ref_genes = {}
k_ref_seqs_filename = os.path.join(temp_dir, 'temp_k_ref_seqs.fasta')
gene_seqs_filename = os.path.join(temp_dir, 'temp_gene_seqs.fasta')
k_ref_seqs = open(k_ref_seqs_filename, 'wt')
gene_seqs = open(gene_seqs_filename, 'wt')
if locus_label == 'automatically determined':
locus_label = find_locus_label(genbank)
else:
check_locus_label(genbank, locus_label)
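    # Each GenBank record is one locus: get its name from the source note, its sequence and its CDS genes.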
for record in SeqIO.parse(genbank, 'genbank'):
k_locus_name = ''
for feature in record.features:
if feature.type == 'source' and 'note' in feature.qualifiers:
for note in feature.qualifiers['note']:
if note.startswith(locus_label):
k_locus_name = get_locus_name_from_note(note, locus_label)
elif note.startswith('Extra genes'):
k_locus_name = note.replace(':', '').replace(' ', '_')
if k_locus_name in k_ref_genes:
quit_with_error('Duplicate reference locus name: ' + k_locus_name)
k_ref_genes[k_locus_name] = []
# Extra genes are only used for the gene search, not the nucleotide search.
if not k_locus_name.startswith('Extra_genes'):
k_ref_seqs.write('>' + k_locus_name + '\n')
k_ref_seqs.write(add_line_breaks_to_sequence(str(record.seq), 60))
gene_num = 1
for feature in record.features:
if feature.type == 'CDS':
gene = Gene(k_locus_name, gene_num, feature, record.seq)
k_ref_genes[k_locus_name].append(gene)
gene_num += 1
gene_seqs.write(gene.get_fasta())
k_ref_seqs.close()
gene_seqs.close()
return k_ref_seqs_filename, gene_seqs_filename, k_ref_genes
def find_locus_label(genbank):
"""
Automatically finds the label for the locus sequences. The Genbank file must have exactly one
possible label that is present in a note qualifier in the source feature for every record. If
not, Kaptive will quit with an error.
"""
possible_locus_labels = set()
for record in SeqIO.parse(genbank, 'genbank'):
for feature in record.features:
if feature.type == 'source' and 'note' in feature.qualifiers:
for note in feature.qualifiers['note']:
if ':' in note:
possible_locus_labels.add(note.split(':')[0].strip())
if '=' in note:
possible_locus_labels.add(note.split('=')[0].strip())
if not possible_locus_labels:
quit_with_error('None of the records contain a valid locus label')
available_locus_labels = possible_locus_labels.copy()
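    # Second pass: keep only the labels common to every record ('Extra genes' records are skipped).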
for record in SeqIO.parse(genbank, 'genbank'):
locus_labels = set()
for feature in record.features:
if feature.type == 'source' and 'note' in feature.qualifiers:
for note in feature.qualifiers['note']:
if ':' in note:
locus_labels.add(note.split(':')[0].strip())
if '=' in note:
locus_labels.add(note.split('=')[0].strip())
if any(x == 'Extra genes' for x in locus_labels):
continue
if not locus_labels:
quit_with_error('no possible locus labels were found for ' + record.name)
previous_labels = available_locus_labels.copy()
available_locus_labels = available_locus_labels.intersection(locus_labels)
if not available_locus_labels:
error_message = record.name + ' does not have a locus label matching the previous ' \
'records\n'
error_message += 'Previous record labels: ' + ', '.join(list(previous_labels)) + '\n'
error_message += 'Labels in ' + record.name + ': ' + ', '.join(list(locus_labels))
quit_with_error(error_message)
if len(available_locus_labels) > 1:
error_message = 'multiple possible locus labels were found: ' + \
', '.join(list(available_locus_labels)) + '\n'
error_message += 'Please use the --locus_label option to specify which to use'
quit_with_error(error_message)
return list(available_locus_labels)[0]
def check_locus_label(genbank, locus_label):
"""
Makes sure that every record in the Genbank file contains a note in the source feature
beginning with the given label.
"""
for record in SeqIO.parse(genbank, 'genbank'):
found_label = False
for feature in record.features:
if feature.type == 'source' and 'note' in feature.qualifiers:
for note in feature.qualifiers['note']:
if note.startswith(locus_label):
k_locus_name = get_locus_name_from_note(note, locus_label)
if k_locus_name:
found_label = True
if not found_label:
error_message = record.name + ' is missing a locus label\n'
error_message += 'The source feature must have a note qualifier beginning with "' + \
locus_label + ':" followed by the locus name'
quit_with_error(error_message)
def get_locus_name_from_note(full_note, locus_label):
"""
Extracts the part of the note following the label (and any colons, spaces or equals signs).
"""
locus_name = full_note[len(locus_label):].strip()
while locus_name.startswith(':') or locus_name.startswith(' ') or \
locus_name.startswith('='):
locus_name = locus_name[1:]
return locus_name
def check_files_exist(filenames):
"""Checks to make sure each file in the given list exists."""
for filename in filenames:
if filename is not None:
check_file_exists(filename)
def check_file_exists(filename):
"""Checks to make sure the single given file exists."""
if not os.path.isfile(filename):
quit_with_error('could not find ' + filename)
def check_assembly_format(filenames):
"""Tries to load each assembly and shows an error if it did not successfully load."""
for assembly in filenames:
fasta = load_fasta(assembly)
if len(fasta) < 1:
quit_with_error('invalid FASTA file: ' + assembly)
for record in fasta:
header, seq = record
if len(seq) == 0:
quit_with_error('invalid FASTA file (contains a zero-length sequence): ' +
assembly)
def quit_with_error(message):
"""Displays the given message and ends the program's execution."""
print('Error:', message, file=sys.stderr)
sys.exit(1)
def get_best_k_type_match(assembly, k_refs_fasta, k_refs, threads):
"""
Searches for all known locus types in the given assembly and returns the best match.
Best match is defined as the locus type for which the largest fraction of the locus has a BLAST
    hit to the assembly. In cases of a tie, the mean identity of the locus type BLAST hits is used
    to determine the best.
"""
for k_ref in k_refs.values():
k_ref.clear()
blast_hits = get_blast_hits(assembly.fasta, k_refs_fasta, threads)
for hit in blast_hits:
if hit.qseqid not in k_refs:
quit_with_error('BLAST hit (' + hit.qseqid + ') not found in locus references')
k_refs[hit.qseqid].add_blast_hit(hit)
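    # Choose the reference with the greatest coverage; ties are broken by mean BLAST hit identity.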
best_k_ref = None
best_cov = 0.0
for k_ref in k_refs.values():
cov = k_ref.get_coverage()
if cov > best_cov:
best_cov = cov
best_k_ref = k_ref
elif cov == best_cov and best_k_ref and \
k_ref.get_mean_blast_hit_identity() > best_k_ref.get_mean_blast_hit_identity():
best_k_ref = k_ref
if best_k_ref is not None:
best_k_ref.clean_up_blast_hits()
return copy.copy(best_k_ref)
def type_gene_search(assembly_pieces_fasta, type_gene_names, args):
if not type_gene_names or not args.allelic_typing:
return {}
if not assembly_pieces_fasta:
return {x: None for x in type_gene_names}
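    # BLAST the typing genes against a temporary database built from the locus assembly pieces.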
makeblastdb(assembly_pieces_fasta)
all_gene_blast_hits = get_blast_hits(assembly_pieces_fasta, args.allelic_typing, args.threads,
type_genes=True)
clean_blast_db(assembly_pieces_fasta)
# Filter out small hits.
all_gene_blast_hits = [x for x in all_gene_blast_hits
if x.query_cov >= args.min_gene_cov and x.pident >= args.min_gene_id]
type_gene_results = {}
for gene_name in type_gene_names:
blast_hits = sorted([x for x in all_gene_blast_hits if x.gene_name == gene_name],
reverse=True, key=lambda z: z.bitscore)
if not blast_hits:
hit = None
else:
perfect_match = None
for hit in blast_hits:
if hit.pident == 100.0 and hit.query_cov == 100.0:
perfect_match = hit
break
if perfect_match:
hit = perfect_match
hit.result = str(perfect_match.allele_number)
else:
hit = blast_hits[0]
hit.result = str(blast_hits[0].allele_number) + '*'
type_gene_results[gene_name] = hit
return type_gene_results
def find_assembly_pieces(assembly, k_locus, args):
"""
This function uses the BLAST hits in the given locus type to find the corresponding pieces of
the given assembly. It saves its results in the KLocus object.
"""
if not k_locus.blast_hits:
return
assembly_pieces = [x.get_assembly_piece(assembly) for x in k_locus.blast_hits]
merged_pieces = merge_assembly_pieces(assembly_pieces)
length_filtered_pieces = [x for x in merged_pieces if x.get_length() >= args.min_assembly_piece]
if not length_filtered_pieces:
return
k_locus.assembly_pieces = fill_assembly_piece_gaps(length_filtered_pieces, args.gap_fill_size)
# Now check to see if the biggest assembly piece seems to capture the whole locus. If so, this
# is an ideal match.
biggest_piece = sorted(k_locus.assembly_pieces, key=lambda z: z.get_length(), reverse=True)[0]
start = biggest_piece.earliest_hit_coordinate()
end = biggest_piece.latest_hit_coordinate()
if good_start_and_end(start, end, k_locus.get_length(), args.start_end_margin):
k_locus.assembly_pieces = [biggest_piece]
# If it isn't the ideal case, we still want to check if the start and end of the locus were
# found in the same contig. If so, fill all gaps in between so we include the entire
# intervening sequence.
else:
earliest, latest, same_contig_and_strand = k_locus.get_earliest_and_latest_pieces()
k_start = earliest.earliest_hit_coordinate()
k_end = latest.latest_hit_coordinate()
if good_start_and_end(k_start, k_end, k_locus.get_length(), args.start_end_margin) and \
same_contig_and_strand:
gap_filling_piece = AssemblyPiece(assembly, earliest.contig_name, earliest.start,
latest.end, earliest.strand)
k_locus.assembly_pieces = merge_assembly_pieces(k_locus.assembly_pieces +
[gap_filling_piece])
k_locus.identity = get_mean_identity(k_locus.assembly_pieces)
def protein_blast(assembly, k_locus, gene_seqs, args):
"""
Conducts a BLAST search of all known locus proteins. Stores the results in the KLocus
object.
"""
hits = get_blast_hits(assembly.fasta, gene_seqs, args.threads, genes=True)
hits = [x for x in hits if x.query_cov >= args.min_gene_cov and x.pident >= args.min_gene_id]
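    # For each gene expected in this locus, keep its single best hit and discard hits that conflict with it.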
expected_hits = []
for expected_gene in k_locus.gene_names:
best_hit = get_best_hit_for_query(hits, expected_gene, k_locus)
if not best_hit:
k_locus.missing_expected_genes.append(expected_gene)
else:
best_hit.over_identity_threshold = best_hit.pident >= args.low_gene_id
expected_hits.append(best_hit)
hits = [x for x in hits if x is not best_hit]
hits = cull_conflicting_hits(best_hit, hits)
other_hits = cull_all_conflicting_hits(hits)
k_locus.expected_hits_inside_locus = [x for x in expected_hits
if x.in_assembly_pieces(k_locus.assembly_pieces)]
k_locus.expected_hits_outside_locus = [x for x in expected_hits
if not x.in_assembly_pieces(k_locus.assembly_pieces)]
k_locus.other_hits_inside_locus = [x for x in other_hits
if x.in_assembly_pieces(k_locus.assembly_pieces)]
k_locus.other_hits_outside_locus = [x for x in other_hits
if not x.in_assembly_pieces(k_locus.assembly_pieces)]
def create_table_file(output_prefix, type_gene_names):
"""
Creates the table file and writes a header line if necessary.
If the file already exists and the header line is correct, then it does nothing (to allow
multiple independent processes to append to the file).
"""
table_path = output_prefix + '_table.txt'
    # If the table already exists with the correct header, we don't need to do anything.
if os.path.isfile(table_path):
with open(table_path, 'r') as existing_table:
first_line = existing_table.readline().strip()
if first_line.startswith('Assembly\tBest match locus'):
return
headers = ['Assembly',
'Best match locus',
'Match confidence',
'Problems',
'Coverage',
'Identity',
'Length discrepancy',
'Expected genes in locus',
'Expected genes in locus, details',
'Missing expected genes',
'Other genes in locus',
'Other genes in locus, details',
'Expected genes outside locus',
'Expected genes outside locus, details',
'Other genes outside locus',
'Other genes outside locus, details']
if type_gene_names:
headers += type_gene_names
with open(table_path, 'w') as table:
table.write('\t'.join(headers))
table.write('\n')
def get_type_gene_names(type_genes_fasta):
gene_names = []
if type_genes_fasta:
gene_names = set()
with open(type_genes_fasta, 'rt') as type_genes_db:
for line in type_genes_db:
if line.startswith('>'):
try:
gene_names.add(line.split('>')[1].split('__')[1])
except IndexError:
quit_with_error(type_genes_fasta + ' not formatted as an SRST2 database '
'FASTA file')
if not gene_names:
quit_with_error(type_genes_fasta + ' not formatted as an SRST2 database FASTA file')
gene_names = sorted(list(gene_names))
return gene_names
def check_name_for_o1_o2(k_locus):
"""
    This function has special logic for dealing with the O1/O2 locus. If the wbbY and wbbZ genes
    are both found, then we call the locus O1 (instead of O1/O2). If neither is found, then we
    call the locus O2.
"""
if not (k_locus.name == 'O1/O2v1' or k_locus.name == 'O1/O2v2'):
return
other_gene_names = [x.qseqid for x in k_locus.other_hits_outside_locus]
both_present = ('Extra_genes_wbbY/wbbZ_01_wbbY' in other_gene_names and
'Extra_genes_wbbY/wbbZ_02_wbbZ' in other_gene_names)
both_absent = ('Extra_genes_wbbY/wbbZ_01_wbbY' not in other_gene_names and
'Extra_genes_wbbY/wbbZ_02_wbbZ' not in other_gene_names)
if both_present:
k_locus.name = k_locus.name.replace('O1/O2', 'O1')
elif both_absent:
k_locus.name = k_locus.name.replace('O1/O2', 'O2')
def output(output_prefix, assembly, k_locus, args, type_gene_names, type_gene_results,
json_list, output_table, output_json, all_gene_dict):
"""
Writes a line to the output table describing all that we've learned about the given locus and
writes to stdout as well.
"""
uncertainty_chars = k_locus.get_match_uncertainty_chars()
try:
expected_in_locus_per = 100.0 * len(k_locus.expected_hits_inside_locus) / \
len(k_locus.gene_names)
expected_out_locus_per = 100.0 * len(k_locus.expected_hits_outside_locus) / \
len(k_locus.gene_names)
expected_genes_in_locus_str = str(len(k_locus.expected_hits_inside_locus)) + ' / ' + \
str(len(k_locus.gene_names)) + ' (' + float_to_str(expected_in_locus_per) + '%)'
expected_genes_out_locus_str = str(len(k_locus.expected_hits_outside_locus)) + ' / ' + \
str(len(k_locus.gene_names)) + ' (' + float_to_str(expected_out_locus_per) + '%)'
missing_per = 100.0 * len(k_locus.missing_expected_genes) / len(k_locus.gene_names)
missing_genes_str = str(len(k_locus.missing_expected_genes)) + ' / ' + \
str(len(k_locus.gene_names)) + ' (' + float_to_str(missing_per) + '%)'
except ZeroDivisionError:
expected_genes_in_locus_str, expected_genes_out_locus_str, missing_genes_str = '', '', ''
output_to_stdout(assembly, k_locus, args.verbose, type_gene_names, type_gene_results,
uncertainty_chars, expected_genes_in_locus_str, expected_genes_out_locus_str,
missing_genes_str)
if output_table:
output_to_table(output_prefix, assembly, k_locus, type_gene_names, type_gene_results,
uncertainty_chars, expected_genes_in_locus_str,
expected_genes_out_locus_str)
if output_json:
add_to_json(assembly, k_locus, type_gene_names, type_gene_results, json_list,
uncertainty_chars, all_gene_dict)
def output_to_table(output_prefix, assembly, k_locus, type_gene_names, type_gene_results,
uncertainty_chars, expected_genes_in_locus_str, expected_genes_out_locus_str):
line = [assembly.name,
k_locus.name,
k_locus.get_match_confidence(),
uncertainty_chars,
k_locus.get_coverage_string(),
k_locus.get_identity_string(),
k_locus.get_length_discrepancy_string(),
expected_genes_in_locus_str,
get_gene_info_string(k_locus.expected_hits_inside_locus),
';'.join(k_locus.missing_expected_genes),
str(len(k_locus.other_hits_inside_locus)),
get_gene_info_string(k_locus.other_hits_inside_locus),
expected_genes_out_locus_str,
get_gene_info_string(k_locus.expected_hits_outside_locus),
str(len(k_locus.other_hits_outside_locus)),
get_gene_info_string(k_locus.other_hits_outside_locus)]
for gene_name in type_gene_names:
hit = type_gene_results[gene_name]
line.append('-' if not hit else hit.result)
table_path = output_prefix + '_table.txt'
table = open(table_path, 'at')
table.write('\t'.join(line))
table.write('\n')
table.close()
def add_to_json(assembly, k_locus, type_gene_names, type_gene_results, json_list,
uncertainty_chars, all_gene_dict):
json_record = OrderedDict()
json_record['Assembly name'] = assembly.name
match_dict = OrderedDict()
match_dict['Locus name'] = k_locus.name
match_dict['Match confidence'] = k_locus.get_match_confidence()
reference_dict = OrderedDict()
reference_dict['Length'] = len(k_locus.seq)
reference_dict['Sequence'] = k_locus.seq
match_dict['Reference'] = reference_dict
json_record['Best match'] = match_dict
problems = OrderedDict()
problems['Locus assembled in multiple pieces'] = str('?' in uncertainty_chars)
problems['Missing genes in locus'] = str('-' in uncertainty_chars)
problems['Extra genes in locus'] = str('+' in uncertainty_chars)
problems['At least one low identity gene'] = str('*' in uncertainty_chars)
json_record['Problems'] = problems
blast_results = OrderedDict()
blast_results['Coverage'] = k_locus.get_coverage_string()
blast_results['Identity'] = k_locus.get_identity_string()
blast_results['Length discrepancy'] = k_locus.get_length_discrepancy_string()
assembly_pieces = []
for i, piece in enumerate(k_locus.assembly_pieces):
assembly_piece = OrderedDict()
assembly_piece['Contig name'] = piece.contig_name
assembly_piece['Contig start position'] = piece.start + 1
assembly_piece['Contig end position'] = piece.end
assembly_piece['Contig strand'] = piece.strand
piece_seq = piece.get_sequence()
assembly_piece['Length'] = len(piece_seq)
assembly_piece['Sequence'] = piece_seq
assembly_pieces.append(assembly_piece)
blast_results['Locus assembly pieces'] = assembly_pieces
json_record['blastn result'] = blast_results
expected_genes_in_locus = {x.qseqid: x for x in k_locus.expected_hits_inside_locus}
expected_hits_outside_locus = {x.qseqid: x for x in k_locus.expected_hits_outside_locus}
other_hits_inside_locus = {x.qseqid: x for x in k_locus.other_hits_inside_locus}
other_hits_outside_locus = {x.qseqid: x for x in k_locus.other_hits_outside_locus}
k_locus_genes = []
for gene in k_locus.genes:
gene_dict = OrderedDict()
gene_name = gene.full_name
gene_dict['Name'] = gene_name
if gene_name in expected_genes_in_locus:
gene_dict['Result'] = 'Found in locus'
elif gene_name in expected_hits_outside_locus:
gene_dict['Result'] = 'Found outside locus'
else:
gene_dict['Result'] = 'Not found'
gene_dict['Reference'] = gene.get_reference_info_json_dict()
if gene_name in expected_genes_in_locus or gene_name in expected_hits_outside_locus:
if gene_name in expected_genes_in_locus:
hit = expected_genes_in_locus[gene_name]
else:
hit = expected_hits_outside_locus[gene_name]
gene_dict['tblastn result'] = hit.get_blast_result_json_dict(assembly)
gene_dict['Match confidence'] = hit.get_match_confidence()
else:
gene_dict['Match confidence'] = 'Not found'
k_locus_genes.append(gene_dict)
json_record['Locus genes'] = k_locus_genes
extra_genes = OrderedDict()
for gene_name, hit in other_hits_inside_locus.items():
gene_dict = OrderedDict()
gene = all_gene_dict[gene_name]
gene_dict['Reference'] = gene.get_reference_info_json_dict()
gene_dict['tblastn result'] = hit.get_blast_result_json_dict(assembly)
extra_genes[gene_name] = gene_dict
json_record['Other genes in locus'] = extra_genes
other_genes = OrderedDict()
for gene_name, hit in other_hits_outside_locus.items():
gene_dict = OrderedDict()
gene = all_gene_dict[gene_name]
gene_dict['Reference'] = gene.get_reference_info_json_dict()
gene_dict['tblastn result'] = hit.get_blast_result_json_dict(assembly)
other_genes[gene_name] = gene_dict
json_record['Other genes outside locus'] = other_genes
if type_gene_names:
allelic_typing = OrderedDict()
for gene_name in type_gene_names:
allelic_type = OrderedDict()
if not type_gene_results[gene_name]:
allelic_type['Allele'] = 'Not found'
else:
blast_hit = type_gene_results[gene_name]
allele = blast_hit.result
if allele.endswith('*'):
perfect_match = False
allele = allele[:-1]
else:
perfect_match = True
try:
allele = int(allele)
except ValueError:
pass
allelic_type['Allele'] = allele
allelic_type['Perfect match'] = str(perfect_match)
allelic_type['blastn result'] = blast_hit.get_blast_result_json_dict(assembly)
allelic_typing[gene_name] = allelic_type
json_record['Allelic_typing'] = allelic_typing
json_list.append(json_record)
def write_json_file(output_prefix, json_list):
json_filename = output_prefix + '.json'
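    # An exclusive flock guards against concurrent Kaptive processes writing the same JSON file.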
if not os.path.isfile(json_filename):
with open(output_prefix + '.json', 'wt') as json_out:
fcntl.flock(json_out, fcntl.LOCK_EX)
json_out.write(json.dumps(json_list, indent=4))
json_out.write('\n')
fcntl.flock(json_out, fcntl.LOCK_UN)
else:
with open(output_prefix + '.json', 'r+t') as json_out:
fcntl.flock(json_out, fcntl.LOCK_EX)
file_data = json_out.read()
try:
existing_json_list = json.loads(file_data, object_pairs_hook=OrderedDict)
json_list = existing_json_list + json_list
except ValueError:
pass
json_out.seek(0)
json_out.write(json.dumps(json_list, indent=4))
json_out.write('\n')
json_out.truncate()
fcntl.flock(json_out, fcntl.LOCK_UN)
def output_to_stdout(assembly, k_locus, verbose, type_gene_names, type_gene_results,
uncertainty_chars, expected_genes_in_locus_str, expected_genes_out_locus_str,
missing_genes_str):
if verbose:
print()
assembly_name_line = 'Assembly: ' + assembly.name
print(assembly_name_line)
print('-' * len(assembly_name_line))
print(' Best match locus: ' + k_locus.name)
print(' Match confidence: ' + k_locus.get_match_confidence())
print(' Problems: ' + (uncertainty_chars if uncertainty_chars else 'None'))
print(' Coverage: ' + k_locus.get_coverage_string())
print(' Identity: ' + k_locus.get_identity_string())
print(' Length discrepancy: ' + k_locus.get_length_discrepancy_string())
print()
print_assembly_pieces(k_locus.assembly_pieces)
print_gene_hits('Expected genes in locus: ' + expected_genes_in_locus_str,
k_locus.expected_hits_inside_locus)
print_gene_hits('Expected genes outside locus: ' + expected_genes_out_locus_str,
k_locus.expected_hits_outside_locus)
print(' Missing expected genes: ' + missing_genes_str)
for missing_gene in k_locus.missing_expected_genes:
print(' ' + missing_gene)
print()
print_gene_hits('Other genes in locus: ' + str(len(k_locus.other_hits_inside_locus)),
k_locus.other_hits_inside_locus)
print_gene_hits('Other genes outside locus: ' + str(len(k_locus.other_hits_outside_locus)),
k_locus.other_hits_outside_locus)
for gene_name in type_gene_names:
result = 'Not found' if not type_gene_results[gene_name] \
else type_gene_results[gene_name].result
print(' ' + gene_name + ' allele: ' + result)
print()
else: # not verbose
simple_output = assembly.name + ': ' + k_locus.name + uncertainty_chars
for gene_name in type_gene_names:
result = 'Not found' if not type_gene_results[gene_name] \
else type_gene_results[gene_name].result
simple_output += ', ' + gene_name + '=' + result
print(simple_output)
def print_assembly_pieces(pieces):
"""This function prints assembly pieces nicely for verbose output."""
print(' Locus assembly pieces:')
if pieces:
longest_header = max([len(x.get_nice_header()) for x in pieces])
for piece in pieces:
first_part = piece.get_nice_header()
first_part = first_part.ljust(longest_header)
second_part = piece.get_sequence_short()
print(' ' + first_part + ' ' + second_part)
print()
def print_gene_hits(title, hits):
"""This function prints gene hits nicely for verbose output."""
print(' ' + title)
if hits:
longest_gene_name = max([len(x.qseqid) for x in hits])
longest_contig_details = max([len(x.get_contig_details_string()) for x in hits])
longest_coverage_details = max([len(x.get_coverage_details_string()) for x in hits])
cov_space = max([x.query_cov for x in hits]) == 100.0
id_space = max([x.pident for x in hits]) == 100.0
spacing_1 = longest_gene_name + 2
spacing_2 = spacing_1 + longest_contig_details + 2
spacing_3 = spacing_2 + longest_coverage_details + 2
for hit in hits:
print(' ' + hit.get_aligned_string(spacing_1, spacing_2, spacing_3,
cov_space, id_space))
print()
def float_to_str(float_in):
"""
This function converts a float to a string in a special manner: if the float is an integer,
    the resulting string has no decimal point. Otherwise, one decimal place is used.
"""
if float_in == int(float_in):
return str(int(float_in))
else:
return '%.1f' % float_in
def get_blast_hits(database, query, threads, genes=False, type_genes=False):
"""Returns a list BlastHit objects for a search of the given query in the given database."""
if genes:
command = ['tblastn',
'-db_gencode', '11', # bacterial translation table
'-seg', 'no'] # don't filter out low complexity regions
else:
command = ['blastn', '-task', 'blastn']
command += ['-db', database, '-query', query, '-num_threads', str(threads), '-outfmt',
'6 qseqid sseqid qstart qend sstart send evalue bitscore length pident qlen qseq '
'sseq']
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = process.communicate()
out = convert_bytes_to_str(out)
err = convert_bytes_to_str(err)
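    # Any output on stderr is treated as fatal, even if BLAST returned a zero exit code.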
if err:
quit_with_error(command[0] + ' encountered an error:\n' + err)
if process.returncode != 0:
msg = command[0] + ' crashed!\n'
# A known crash can occur with tblastn and recent versions of BLAST+ when multiple threads
# are used. Check for this case and display an informative error message if so.
version = get_blast_version(command[0])
bad_version = (version == '2.4.0') or (version == '2.5.0') or (version == '2.6.0')
if threads > 1 and bad_version:
msg += '\nYou are using BLAST+ v' + version + ' which may crash when running with '
msg += 'multiple threads.\n\n'
msg += 'To avoid this issue, try one of the following:\n'
msg += ' 1) Use an unaffected version of BLAST+ (v2.3.0 or earlier should work)\n'
msg += ' 2) Run Kaptive with "--threads 1" (will probably be slower)\n'
quit_with_error(msg)
if genes:
blast_hits = [GeneBlastHit(line) for line in line_iterator(out)]
elif type_genes:
blast_hits = [TypeGeneBlastHit(line) for line in line_iterator(out)]
else:
blast_hits = [BlastHit(line) for line in line_iterator(out)]
return blast_hits
def get_blast_version(program):
command = [program, '-version']
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = process.communicate()
out = convert_bytes_to_str(out)
try:
return out.split(': ')[1].split()[0].split('+')[0]
except IndexError:
return ''
def get_best_hit_for_query(blast_hits, query_name, k_locus):
"""
Given a list of BlastHits, this function returns the best hit for the given query, based first
on whether or not the hit is in the assembly pieces, then on bit score.
It returns None if no BLAST hits match that query.
"""
matching_hits = [x for x in blast_hits if x.qseqid == query_name]
if matching_hits:
return sorted(matching_hits,
key=lambda z: (z.in_assembly_pieces(k_locus.assembly_pieces), z.bitscore),
reverse=True)[0]
else:
return None
def cull_conflicting_hits(hit_to_keep, blast_hits):
"""
This function returns a (potentially) reduced set of BLAST hits which excludes BLAST hits that
overlap too much (same part of assembly) with the hit to keep.
"""
return [x for x in blast_hits if not x.conflicts(hit_to_keep)]
def cull_all_conflicting_hits(blast_hits):
"""
This function returns a (potentially) reduced set of BLAST hits where none of the remaining
hits conflict.
"""
blast_hits.sort(key=lambda x: x.bitscore, reverse=True)
kept_hits = []
while blast_hits:
kept_hits.append(blast_hits.pop(0))
blast_hits = cull_conflicting_hits(kept_hits[-1], blast_hits)
return kept_hits
def merge_assembly_pieces(pieces):
"""
Takes a list of AssemblyPiece objects and returns another list of AssemblyPiece objects where
the overlapping pieces have been merged.
"""
while True:
merged_pieces = []
merge_count = 0
while pieces:
merged_piece = pieces[0]
unmerged = []
for other_piece in pieces[1:]:
combined = merged_piece.combine(other_piece)
if not combined:
unmerged.append(other_piece)
else:
merged_piece = combined
merge_count += 1
merged_pieces.append(merged_piece)
pieces = unmerged
if merge_count == 0:
break
else:
pieces = merged_pieces
return merged_pieces
def fill_assembly_piece_gaps(pieces, max_gap_fill_size):
"""
    This function takes a list of assembly pieces, and if any of them are close enough to each
    other, the gap between them will be merged in.
It assumes that all given pieces are from the same assembly.
"""
pieces_by_contig_and_strand = {}
fixed_pieces = []
for piece in pieces: