-
Notifications
You must be signed in to change notification settings - Fork 6
/
filter_operon_blast_results.py.bak
executable file
·444 lines (362 loc) · 21.2 KB
/
filter_operon_blast_results.py.bak
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
#!/usr/bin/python
from multiprocessing import Pool
import time
import os
import sys
import argparse
import math
from homolog4 import *
from collections import defaultdict
import itertools
# Copyright(C) 2015 David Ream
# Released under GPL version 3 licence. http://www.gnu.org/licenses/lgpl.html
# Do not remove this comment
# This exists to make the main function easier to read. It contains code to run the argument parser, and does nothing else.
def parser_code():
parser = argparse.ArgumentParser(description="This program will be used to remove spurious results from a BLAST search organized by gene block.")
parser.add_argument("-i", "--infolder", dest="infolder", default='./blast_parse/', metavar="DIRECTORY",
help="A folder that contains the gene block BLAST results.")
parser.add_argument("-o", "--outfolder", dest="outfolder", metavar="DIRECTORY", default='./optimized_gene_block/',
help="Folder where the filtered results will be stored. Default is the folder './optimized_gene_block/'.")
parser.add_argument("-f", "--filter", dest="filter", default='', metavar="FILE",
help="A file that contains the gene blocks that are under investigation. All others will be omitted from analysis an results.")
parser.add_argument("-n", "--num_proc", dest="num_proc", metavar="INT", default = os.sysconf("SC_NPROCESSORS_CONF"), type=int,
help="Number of processors that you want this script to run on. The default is every CPU that the system has.")
parser.add_argument("-e", "--eval", dest="eval", default='1e-10', metavar="FLOAT", type=float,
help="Use this option to change the eval for the BLAST search that is permitted. Useful if you would like to investigate what altering the eval threshold will do to your results.")
parser.add_argument("-g", "--max_gap", dest="max_gap", metavar="INT", default = 500, type=int,
help="Largest allowable gap to be considered a gene block by the analysis.")
parser.add_argument("-q", "--quiet", dest="quiet", action="store_true", default=False,
help="Suppresses most program text outputs.")
return parser.parse_args()
def check_options(parsed_args):
if os.path.isdir(parsed_args.infolder):
infolder = parsed_args.infolder
else:
print "The infolder directory %s does not exist." % parsed_args.infolder
sys.exit()
# if the directory that the user specifies does not exist, then the program makes it for them.
if not os.path.isdir(parsed_args.outfolder):
os.makedirs(parsed_args.outfolder)
outfolder = parsed_args.outfolder
if outfolder[-1] != '/':
outfolder = outfolder + '/'
if os.path.exists(parsed_args.filter):
filter_file = parsed_args.filter
elif parsed_args.filter == '':
filter_file = parsed_args.filter
else:
print "The filter file %s does not exist." % parsed_args.filter
sys.exit()
# section of code that deals determining the number of CPU cores that will be used by the program
if parsed_args.num_proc > os.sysconf("SC_NPROCESSORS_CONF"):
num_proc = os.sysconf("SC_NPROCESSORS_CONF")
elif parsed_args.num_proc < 1:
num_proc = 1
else:
num_proc = int(parsed_args.num_proc)
# validate the input for the eval
try:
e_val = float(parsed_args.eval)
except:
print "The e-value you entered is not a floating point number, please enter a floating point number, ex. '1e-3', or '12'."
sys.exit()
# validate the input for the maximum allowed gap
try:
max_gap = int(parsed_args.max_gap)
if max_gap <= 0:
print "The gap that you entered %s is a negative number, please enter a positive integer." % parsed_args.max_gap
sys.exit()
else:
pass
except:
print "The gap that you entered %s is not an integer, please enter a positive integer." % parsed_args.max_gap
sys.exit()
quiet = parsed_args.quiet
return infolder, outfolder, filter_file, num_proc, e_val, max_gap, quiet
#this function will return all of the files that are in a directory. os.walk is recursive traversal.
def return_recursive_dir_files(root_dir):
result = []
for path, dir_name, flist in os.walk(root_dir):
for f in flist:
fname = os.path.join(path, f)
if os.path.isfile(fname):
result.append(fname)
return result
def return_file_list(infolder, filter_file):
if filter_file == '':
return return_recursive_dir_files(infolder)
else:
filter_list = [i.strip() for i in open(filter_file)]
return [i for i in return_recursive_dir_files(infolder) if os.path.basename(i).split('.')[0] in filter_list]
# This function will take a BLAST tabular result, and remove any hits that are worse than the eval threshold provided
# The return will be a list of those hits as homolog objects.
def filter_eval(fname, e_val):
# make a list of homolog class objects
h_list = [Homolog.from_blast(i) for i in open(fname).readlines()]
result = list(filter((lambda x: x.e_val() <= e_val), h_list))
return result
def resolve_multiple_ORF_hits(hlist):
result = [hlist[0]]
curr_homolog = hlist[0]
for next_homolog in hlist[1:]:
# we have multiple BLAST hits that share a ORF, resolve using e_val
if curr_homolog.start() == next_homolog.start():
# If current homolog has better eval
if curr_homolog.e_val() <= next_homolog.e_val():
# print curr_homolog.organism(), curr_homolog.locus(), "is duplicated"
pass
# The current homolog has a worse eval, remove for the better example
else:
result.pop(-1)
result.append(next_homolog)
#print "This totally worked", next_homolog.organism()
else:
result.append(next_homolog)
# Now that we are done testing the current and next homolog against
curr_homolog = next_homolog
return result
# The purpose of this function is to take a list of homologs, that have been e_val (or potenitally another means) filtered.
# The return is all homologs from organisms that contain at least one neighborhood defined by max_gap.
def return_valid_organism_homologs(hlog_list, max_gap):
org_dict = {}
# Stage 1: read the list of homologs in, and organize based on accession. Each accession will have a list of homologs for a given gene block.
# Prior to this, the program does not sort the output.
# This section has been tested and validated to the best of my abilities.
#print len(hlog_list)
for item in hlog_list:
accession = item.accession()
#print accession
if accession in org_dict.keys():
org_dict[accession].append(item)
else:
org_dict.update({accession:[item]})
# Stage 2: Sort the list of homologs for each organism. Determine gene blocks based on the max_gap criterion,
# and reject organisms without a gene block.
# This section has been tested, but not extensively. I have added resolve_multiple_ORF_hits which is untested.
for accession in sorted(org_dict.keys()):
h_list = org_dict.pop(accession)
h_list.sort(key=lambda x: x.start())
# Here is where the code dealing explicitly with multiple hits to a single ORF goes:
# currently, we only use best hit. Other resolution schemes can be envisioned.
ORF_filetered_hlist = resolve_multiple_ORF_hits(h_list)
org_dict.update({accession:ORF_filetered_hlist})
# Stage 3: Organize the homologs into neighborhoods. We remove any organisms that lack neighboring genes.
# The return from this stage is a list of lists. Where each sub-list is a gene block, as defined by max_gap.
# This version is not completely tested, but appears to be working when tested against known cases.
neighborhood_dict = {}
for accession in sorted(org_dict.keys()):
hlist = org_dict.pop(accession)
gene_block_list, neighborhood_found = group_homologs(hlist, max_gap)
geneblock =''
if accession =="NC_002516" or accession =="NC_004463":
for blocks in hlist:
if "cai" in blocks.blast_annotation():
geneblock += blocks.blast_annotation()+ ',' + str(blocks.query_start())+ ',' + str(blocks.query_stop()) +'\t'
if len(geneblock) != 0:
print accession
print geneblock
if neighborhood_found:
neighborhood_dict.update({accession:gene_block_list})
org_dict.update({accession:hlist})
# Enable the print organism bit to see what we filter out initially...
else: # do nothing, there are no neighborhoods that have been recovered
#print "accession", accession, "is missing."
#print "Organism ", hlist[0].organism(), "is missing."
#print hlist
pass
# An explanation on what each of these returned dictionaries contains:
# Each has organisms that only contain gene neighborhoods.
# neighborhood_dict: accession keyed dict whose data is a list of neighborhods. organism1:[[h1, h2], [h3, h4], [h5]]
# org_dict: accession keyed dict whose data is a list of ungrouped homologs. organism1:[h1, h2, h3, h4, h5]
# org_dict differs from the inpupt (besides being a dict and not a list) by
return neighborhood_dict, org_dict
# I think this version is more readable than those i have made in the past.
# It can take either a sorted, or unsorted list of homologs.
def group_homologs(lst_homologs, max_gap):
# bypassing pass by reference in python that leads to potentially undesirable behavior downstream
list_homologs = [i for i in lst_homologs]
# Step 1: Sort the input list by the start position of the ORF
list_homologs.sort(key=lambda x: x.start())
# Step 2: Group homologs into gene blocks as defined my max_gap, and report these groups.
result, neighborhood_found = homolog_list_grouping_function(list_homologs, max_gap)
return result, neighborhood_found
# This function will take a list of ordered homologs, which have had their redundant BLAST thits removed, and group them by a max_gap constraint.
# The return is a list of lists. Single genes and gene blocks will both be represented as groups.
def homolog_list_grouping_function(list_homologs, max_gap):
result = []
neighborhood = [list_homologs[0]]
neighborhood_found = False
for i in list_homologs[1:]:
#look at current
start = neighborhood[-1].start() #start = list_homologs[i].start()
stop = neighborhood[-1].stop() #stop = list_homologs[i].stop()
# look at next
start_n = i.start() #start_n = list_homologs[i+1].start()
stop_n = i.stop() #stop_n = list_homologs[i+1].stop()
# We have found neighboring genes, as defined by max_gap
if math.fabs(start - stop_n) < max_gap or math.fabs(stop - start_n) < max_gap:
neighborhood_found = True
neighborhood.append(i)
# These genes do not neighbor eachother
else:
result.append(neighborhood)
neighborhood = [i]
result.append(neighborhood)
#print list_homologs[0].organism(), "result", result, "neighborhood_found ", neighborhood_found
return result, neighborhood_found
# fast implementation of an order preserving make unique function
def make_unique(lst, function):#lambda a: a.start):
seen = {}
result = []
for item in lst:
marker = function(item)
if marker not in seen:
seen.update({marker:1})
result.append(item)
return result
# This function will take the grouped lists, and determine which gene block genes do not occur as part of a neighborhood.
# These singletons genes will further be filtered for the best e-val as reported by BLAST. (Though other filtering schemes could be developed later)
def return_best_singleton_genes(grouped_lists):
# the result will be a list of lists.
result = []
# Step 1: determine all recovered genes (for a given gene block) in this organism
# return a list of evey homolog found by BLAST as a single list
organism_genes = list(itertools.chain(*grouped_lists))
# make list above unique based on the blast_annotation
unique_in_organism_by_annotation = make_unique(organism_genes, lambda x: x.blast_annotation())
# resulting in a list of annotations that are unique to the organism
organism_annotation_list = [i.blast_annotation() for i in unique_in_organism_by_annotation]
#print "organism_annotation_list", organism_annotation_list
# Step 2: determine genes that are found in neighborhoods as defined by the max_gap criterion
# find genes that are grouped
neighborhoods_only = [i for i in grouped_lists if len(i) > 1]
# make into a single list of homologs
neighborhood_hlog_list = list(itertools.chain(*neighborhoods_only))
# make list above unique based on the blast_annotation
unique_neighborhood_by_annotation = make_unique(neighborhood_hlog_list, lambda x: x.blast_annotation())
# resulting in a list of annotations that are unique to neighboring genes
neighborhood_annotation_list = [i.blast_annotation() for i in unique_neighborhood_by_annotation]
#print "neighborhood_annotation_list", neighborhood_annotation_list
# Step 3: return a list of singletons genes, that are only found as singletons in the current genome
single_annotation_list = list(set(organism_annotation_list) - set(neighborhood_annotation_list))
singleton_genes = [i for i in grouped_lists if len(i) == 1]
filtered_singlgeton_genes = [i for i in singleton_genes if i[0].blast_annotation() in single_annotation_list]
# Step 4: Find the singleton gene that has the most significant e-value per BLAST annotation
best_singleton_dict = {}
for tmp in filtered_singlgeton_genes:
gene = tmp[0]
# a singleton gene with this annotation has already been found, use e-val to determine wich is the more significant hit
if gene.blast_annotation() in best_singleton_dict.keys():
old_gene = best_singleton_dict.pop(gene.blast_annotation())
if old_gene.e_val() <= gene.e_val(): # the existing homolog is a more significant hit
best_singleton_dict.update({gene.blast_annotation(): old_gene})
else: # the new homolog is a more significant hit
best_singleton_dict.update({gene.blast_annotation(): gene})
else: # we have not seen this annotation yet, store it in the dictionary
best_singleton_dict.update({gene.blast_annotation(): gene})
# Step 5: return a list of lists, [[s1], [s2], [s3]] for each entry in the best_singleton_dict.
for i in best_singleton_dict.keys():
result.append([best_singleton_dict[i]])
return result
# This version is not rigorously tested, but seems to work correctly.
# Return a list of lists of homologs = [[],[]], number of splits, and number of duplications.
# unique_genes_in_organism, len_gene_block are integers grouped_list is a list of lists, and only contains groups 2 or more.
def optimize_neighborhoods(grouped_lists):
# Step 1: make the grouped lists into a single list of homologs
org_hlog_list = list(itertools.chain(*grouped_lists))
neighborhoods_only = [i for i in grouped_lists if len(i) > 1]
grouped_hlog_list = list(itertools.chain(*neighborhoods_only))
# Step 2: determine the number of unique genes in both neighborhoods, and the organism.
# To better explain: I need to know the number of unique genes the organism contains for the gene block.
# I also need to know the number of unique genes found in neighborhoods.
number_unique_genes_in_organism = len(make_unique(org_hlog_list, lambda x: x.blast_annotation()))
number_unique_in_neighborhoods = len(make_unique(grouped_hlog_list, lambda x: x.blast_annotation()))
'''
# Debugging. This does check out. kinda interesting stuff though here, there are some inline tandem repeats where gene name is different.
# Everything looks like it works correctly though
print org_hlog_list[0].organism(), "number_unique_genes_in_organism", number_unique_genes_in_organism, "number_unique_in_neighborhoods", number_unique_in_neighborhoods
if number_unique_in_neighborhoods == 1:
print "neighborhoods_only", neighborhoods_only
for group in neighborhoods_only:
for gene in group:
print gene.blast_annotation(), gene.genbank_annotation(), gene.locus(), gene.start(), gene.stop()
#print "grouped_lists",grouped_lists
'''
# Step 3: greedy algorithm to determine the best neighborhoods to report as a final result
optimal = False
num_in_list = 1 # this is the number of elements per list reurned
best_duplicates = 0
splits = number_unique_genes_in_organism - number_unique_in_neighborhoods
while not optimal:
for group in itertools.combinations(grouped_lists, num_in_list):
#all_homologs_in_grouping = [item for sublist in group for item in sublist]
all_homologs_in_grouping = list(itertools.chain(*group))
#print all_homologs_in_grouping
#unique_in_set = len(MakeUnique(all_homologs_in_grouping, lambda a: a.predicted_gene))
unique_in_set = len(make_unique(all_homologs_in_grouping, lambda x: x.blast_annotation()))
#if unique_in_set == len_unique_grouped: # we have an optimal solution, perhaps not global optima
if unique_in_set == number_unique_in_neighborhoods: # we have an optimal solution, perhaps not global optima
duplicates = int(math.fabs(len(all_homologs_in_grouping) - number_unique_in_neighborhoods))
if not optimal:
optimal = True
best_grouping = list(group)
best_duplicates = duplicates
best_split = splits
elif duplicates < best_duplicates:
best_grouping = list(group)
best_duplicates = duplicates
splits+=1
num_in_list+=1
#print "splits " , splits, ": best_split ", best_split
#print "Best grouping as found by the program\n", best_grouping
# Step 4: determine the best (if necessary) singleton genes to complete
if number_unique_genes_in_organism != number_unique_in_neighborhoods:
# This step takes time, so only perform it when you have to
best_singletons = return_best_singleton_genes(grouped_lists)
#print "Difference", number_unique_genes_in_organism - number_unique_in_neighborhoods, len(best_singletons)
#print "singletons", best_singletons , ' '.join([i.blast_annotation() for i in list(itertools.chain(*best_singletons))])
best_grouping = best_grouping + best_singletons
return best_grouping, best_split, best_duplicates#, len_unique_grouped
# does not seem to be in use
'''
def parallel_filter_operons(arg_tuple):
fname, outfolder, max_gap, e_val = arg_tuple
'''
def main():
start = time.time()
parsed_args = parser_code()
infolder, outfolder, filter_file, num_proc, e_val, max_gap, quiet = check_options(parsed_args)
if not quiet:
print infolder, outfolder, filter_file, num_proc, e_val, max_gap
file_list = return_file_list(infolder, filter_file)
parallel_list_param = [(i, outfolder, max_gap, e_val) for i in file_list]
for fname in file_list:
#print fname
hlog_list = filter_eval(fname, e_val)
# here we return two dictionaries that are keyed by org, and contain at least one gene block. Orgs without gene blocks are omitted.
neighborhood_dict, org_dict = return_valid_organism_homologs(hlog_list, max_gap)
'''if "caiTABCDE" in fname:
for accession in neighborhood_dict:
if accession =="NC_002516" or accession =="NC_004463":
print accession
for blocks in neighborhood_dict[accession]:
geneblock =''
for homolog in blocks:
geneblock += homolog.blast_annotation()+ ',' + str(homolog.query_start())+ ',' + str(homolog.query_stop()) +'\t'
print geneblock '''
# open a file handle to the output
head, tail = os.path.split(fname)
outfile = outfolder + tail
handle = open(outfile, 'w')
# Save the result, in the result folder, just like the good little program you are.
for org in sorted(neighborhood_dict.keys()):
best_grouping, best_split, best_duplicates = optimize_neighborhoods(neighborhood_dict[org])
grouping_list = sorted(list(itertools.chain(*best_grouping)), key=lambda x: x.start())
handle.write('\n'.join([i.to_file() for i in grouping_list])+'\n')
handle.close()
if not quiet:
print time.time() - start
# ./filter_operon_blast_results.py -f phylo_order.txt
if __name__ == '__main__':
main()