-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmake_a_table.py
executable file
·449 lines (390 loc) · 20.4 KB
/
make_a_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
#!/usr/bin/env python2
import collections
import gzip
import operator
import optparse
import os
import re
import sys
import time
import pybam
# Define a wrapper around quicksect.IntervalTree or intervaltree.IntervalTree, depending
# on which one is available, so that the rest of the script can work with both.
# quicksect is much faster than intervaltree, but harder to install.
# Throughout the script, start and end coordinates are all 1-based and both included
try:
import quicksect
class myintervaltree(quicksect.IntervalTree):
def __init__(self):
super(myintervaltree, self).__init__()
self.interval_to_data = {}
self.has_overlap = self.search
# quicksect cannot associate intervals to data, so we need to it ourselves
def add_data(self, start, end, data):
interval = quicksect.Interval(start, end)
self.insert( interval )
self.interval_to_data[ interval ] = data
def data_overlapping(self, start, end):
return [self.interval_to_data[i] for i in self.search(start, end)]
except ImportError:
import intervaltree
class myintervaltree(intervaltree.IntervalTree):
def __init__(self):
super(myintervaltree, self).__init__()
# Deal with versions 2.* and 3.*
if hasattr(intervaltree.IntervalTree, "search"):
self.search_func = self.search
else:
self.search_func = self.overlap
self.has_overlap = self.search_func
# In intervaltree.IntervalTree the intervals include the lower bound but not the upper bound.
def add_data(self, start, end, data):
self.addi(start, end+1, data)
def has_overlap(self, start, end):
return self.search_func(start, end+1)
def data_overlapping(self, start, end):
return [i.data for i in self.search_func(start, end+1)]
# Simple structure for a read alignment
ReadAlignment = collections.namedtuple('ReadAlignment', ['chrom_name', 'start_pos', 'cigar_list', 'NH_value', 'NM_value'])
# Simple structure for a generic, named, data
NamedEntry = collections.namedtuple('NamedEntry', ['name', 'data'])
parser = optparse.OptionParser(usage = "usage: %prog [options] gtf_file bam_file_1 bam_file_2 ...")
parser.add_option("-r", "--rename_gene", nargs = 2, action = 'append', dest = 'gene_renames', metavar = 'NAME_TO_REPLACE NEW_NAME',
help = 'Replace some erroneous gene names')
parser.add_option("-g", "--gene_types", dest = 'gene_types', default = 'protein_coding',
help = 'Comma-separated list of gene biotypes to use [default: %default]. Use an empty string for no filtering')
parser.add_option("-t", "--transcript_types", dest = 'transcript_types', default = 'protein_coding',
help = 'Comma-separated list of transcript biotypes to use for the exon-overlap filtering [default: %default]. Use an empty string for no filtering')
parser.add_option("-n", "--no_gtf_filter", action = 'store_true',
help = 'Do not use a GTF file to filter the reads. The command line arguments are then expected to all be BAM files.')
parser.add_option("-s", "--strict", action = 'store_true', default = False,
help = 'Use to select the strict mode, which discards paired reads when one read alf is not in an exon')
parser.add_option("-l", "--log", nargs = 1, dest = 'log_file', action = 'store',
help = "Do not include paired reads when one read is not in an exon")
(options, args) = parser.parse_args()
if options.no_gtf_filter:
parser.usage = parser.usage.replace('gtf_file ', '')
gtf_file = None
else:
if len(args) == 0:
parser.error("No GTF/BAM files given")
gtf_file = args.pop(0)
if len(args) == 0:
parser.error("No BAM files given")
bam_files = args
print >> sys.stderr, "BAM+GTF merger for AltHapAlign called with these options"
print >> sys.stderr, "\tgtf_file:", gtf_file
print >> sys.stderr, "\tbam_files:", bam_files
print >> sys.stderr, "\toptions:", options
print >> sys.stderr
class StopWatch:
def __init__(self):
self.ref_time = time.time()
def reset(self):
old_time = self.ref_time
self.ref_time = time.time()
return self.ref_time-old_time
main_stopwatch = StopWatch()
def load_genes(gtf_file):
if not options.no_gtf_filter:
print >> sys.stderr, "Loading the GTF file ... ",
gene_type_filter = re.compile('gene_type "?(%s)"?;' % options.gene_types.replace(",", "|")) if options.gene_types else None
transcript_type_filter = re.compile('transcript_type "?(%s)"?;' % options.transcript_types.replace(",", "|")) if options.transcript_types else None
gene_renames = dict(options.gene_renames) if options.gene_renames else {}
gene_names = collections.defaultdict(myintervaltree)
n_genes = set()
n_exons = 0
if options.no_gtf_filter:
gtf_file = os.devnull
with gzip.GzipFile(gtf_file, 'r') if gtf_file.endswith('.gz') else open(gtf_file, 'r') as f:
for line in f:
t = line.strip().split("\t")
if gene_type_filter and not gene_type_filter.search(t[8]):
continue
if transcript_type_filter and not transcript_type_filter.search(t[8]):
continue
if t[2] == "exon":
if 'gene_name' not in t[8]:
print >> sys.stderr, "Failed\nERROR: no 'gene_name' attribute for this exon:", line,
sys.exit(1)
i1 = t[8].find('gene_name')
i2 = t[8].find(';', i1)
gn = t[8][i1+9:i2].strip().strip('"')
name = gene_renames.get(gn, gn)
start = int(t[3])
end = int(t[4])
gene_names[t[0]].add_data(start, end, name)
n_genes.add(name)
n_exons += 1
if not options.no_gtf_filter:
print >> sys.stderr, "Done (%.2f seconds): %d genes and %d exons" % (main_stopwatch.reset(), len(n_genes), n_exons)
return gene_names
gene_names = load_genes(gtf_file)
# Input: BAM read with tag list, and tag name
# Output: the value of the tag
# Exception: exit if the tag is not present, or in multiple copies
def get_tag_value(read, tag):
values = [t[2] for t in read[-1] if t[0].upper() == tag]
if not values:
print >> sys.stderr, "ERROR: Could not find any %s tag in" % tag, read
sys.exit(1)
if len(values) > 1:
print >> sys.stderr, "ERROR: More than 1 %s tag in" % tag, read
sys.exit(1)
return values[0]
# Input: CIGAR alignment already parsed to a list of (number, character)
# Output: integer
# Description: Computes the length of the alignment on the genome
def mapping_length(cigar_list):
s = 0
for (n, c) in cigar_list:
if c == 'I':
s -= n
else:
s += n
return s
# Holds all the BAM processing functions (generators) and binds them together
class BAMProcessor:
def __init__(self, bam_file):
self.bam_parser = pybam.read(bam_file, ['sam_qname', 'sam_rname', 'sam_pos1', 'sam_cigar_list', 'sam_tags_list'])
self.n_bam_aligns = 0
self.n_singletons = 0
self.n_paired_alignments = 0
self.n_multiple_hits = 0
self.n_non_exonic_bam_aligns = 0
self.n_different_genes_pair = 0
self.n_partially_exonic = 0
self.n_fully_exonic = 0
self.n_gene_match = 0
# Input: BAM iterator with tag list
# Output: BAM iterator with selected tag values (NH and NM)
# Description: Extract the NH and NM values from the BAM alignments
def extract_tags(self):
for read in self.bam_parser:
self.n_bam_aligns += 1
NH_value = get_tag_value(read, 'NH')
NM_value = get_tag_value(read, 'NM')
yield NamedEntry(read[0], ReadAlignment(read[1], read[2], read[3], NH_value, NM_value))
# Input: iterator (read_name, ...data...)
# Output: iterator (read_name, [(...data1...), (...data2...)])
# Description: Group consecutive pairs of reads that have the same name.
# This assumes that the input BAM file is sorted.
def group_read_alignments(self):
last_reads = []
last_read_name = None
for read in self.extract_tags():
this_read_name = read.name
if last_read_name:
if last_read_name != this_read_name:
yield NamedEntry(last_read_name, last_reads)
last_reads = []
last_read_name = this_read_name
else:
last_read_name = this_read_name
last_reads.append(read.data)
if last_read_name:
yield NamedEntry(last_read_name, last_reads)
# Input: iterator over paired alignments
# Output: iterator over a subset of the input
# Description: Discards the alignments that are not uniquely mapped pairs,
# i.e. are singletons or have multiple hits / don't have NH:i:1.
def select_paired_alignments(self):
for aligned_read in self.group_read_alignments():
n_alignments = len(aligned_read.data)
NH_values = [alignment.NH_value for alignment in aligned_read.data]
if n_alignments == 1:
self.n_singletons += 1
elif n_alignments == 2:
# Check that NH = 1 for both
if NH_values[0] != 1:
assert NH_values[1] != 1
# Could be classified as singletons too
self.n_multiple_hits += 2
else:
assert NH_values[1] == 1
self.n_paired_alignments += 1
yield aligned_read
else:
for NH_value in NH_values:
assert NH_value != 1
self.n_multiple_hits += n_alignments
# Input: iterator over paired alignments
# Output: iterator that has gene names too, but on a subset of the alignments
# Description: Discards the reads that do not overlap any exons
def only_exonic_mappings(self):
for aligned_read in self.select_paired_alignments():
ok = 0
genes_seen = set()
for aln in aligned_read.data:
if aln.chrom_name in gene_names:
end_pos = aln.start_pos + mapping_length(aln.cigar_list) - 1
names_here = gene_names[aln.chrom_name].data_overlapping(aln.start_pos, end_pos)
if names_here:
genes_seen.update(names_here)
ok += 1
if ok == 0:
self.n_non_exonic_bam_aligns += 1
elif len(genes_seen) > 1:
self.n_different_genes_pair += 1
else:
if ok == 1:
self.n_partially_exonic += 1
if options.strict:
continue
else:
self.n_fully_exonic += 1
yield NamedEntry(aligned_read.name, NamedEntry(genes_seen.pop(), aligned_read.data))
class BAMMergedProcessor:
def __init__(self, bam_processors, generator_name):
self.bam_processors = bam_processors
self.bam_iterators = map(operator.methodcaller(generator_name), bam_processors)
self.n_groups = 0
self.n_unique_groups = 0
self.n_best_groups = 0
self.n_ambiguous_groups = 0
def stat_sum(self, attr_name):
return sum(map(operator.attrgetter(attr_name), self.bam_processors))
# Input: list of iterators (read_name, data)
# Output: iterator (read_name, [data1 | None, data2 | None, ...])
# Description: For each read name, groups the reads from each BAM file,
# assuming that the files are sorted by read name
# Remarks: This is similar to a k-way merge algorithm. Although the merge
# itself can be achieved in O(log(k)) in theory, my implementations
# did not provide any improvements, especially because the output
# of the function is O(k). So in the end I'm merging the streams in O(k)
def merged_iterators(self):
EMPTY_ITERATOR = NamedEntry(None, None)
current_reads = []
active_bam_parsers = 0
for parser in self.bam_iterators:
try:
current_reads.append( parser.next() )
active_bam_parsers += 1
except StopIteration:
current_reads.append( EMPTY_ITERATOR )
last_read_name = None
while active_bam_parsers:
read_names = [cr.name for cr in current_reads if cr is not EMPTY_ITERATOR]
next_read_name = min(read_names)
if last_read_name and (next_read_name < last_read_name):
not_sorted = []
for (i, cr) in enumerate(current_reads):
if cr.name == next_read_name:
not_sorted.append(bam_files[i])
raise AssertionError("The BAM files %s are not sorted: read '%s' is after '%s'" % (", ".join(not_sorted), next_read_name, last_read_name))
last_read_name = next_read_name
self.n_groups += 1
yield NamedEntry(next_read_name, [cr.data if cr.name == next_read_name else None for cr in current_reads])
for (i, cr) in enumerate(current_reads):
if cr.name == next_read_name:
try:
current_reads[i] = self.bam_iterators[i].next()
except StopIteration:
current_reads[i] = EMPTY_ITERATOR
active_bam_parsers -= 1
# Input: iterator (read_name, [data1 | None, data2 | None, ...])
# Output: iterator (read_name, [data1 | None, data2 | None, ...])
# Description: For each group of reads, finds the gene names on the genome
# (using the alignment) and only keeps the groups that map to
# a single gene name.
def select_same_gene(self):
for read_group in self.merged_iterators():
genes_seen = {}
for pair in read_group.data:
if pair is not None:
gene_name = pair.name
NM_score = sum(alignment.NM_value for alignment in pair.data)
if gene_name in genes_seen:
if NM_score < genes_seen[gene_name]:
genes_seen[gene_name] = NM_score
else:
genes_seen[gene_name] = NM_score
if len(genes_seen) == 1:
status = 'unique'
best_genes = genes_seen.keys()
self.n_unique_groups += 1
else:
best_NM = min(genes_seen.values())
best_genes = [gene_name for (gene_name, NM_score) in genes_seen.items() if NM_score == best_NM]
if len(best_genes) == 1:
self.n_best_groups += 1
status = 'best'
else:
self.n_ambiguous_groups += 1
status = 'ambiguous'
for gene_name in best_genes:
for (bp, pair) in zip(self.bam_processors, read_group.data):
if pair and pair.name == gene_name:
bp.n_gene_match += 1
yield NamedEntry(read_group.name, (status, gene_name, [pair.data if pair and pair.name == gene_name else None for pair in read_group.data]))
def toString(entry):
line = [entry.name]
if isinstance(entry.data, tuple):
# gene name
line.extend(entry.data[:-1])
alns = entry.data[-1]
else:
alns = entry.data
for paired_data in alns:
if paired_data is None:
line.append('NA')
else:
line.append(str(paired_data[0].NM_value + paired_data[1].NM_value))
return "\t".join(line)
print >> sys.stderr, "Opening the BAM files ...",
bam_processors = map(BAMProcessor, bam_files)
print >> sys.stderr, " Done (%.2f seconds)" % main_stopwatch.reset()
last_n_bam_aligns = 0
print >> sys.stderr, "Reading the BAM files ..."
headers = ["read_name"]
if options.no_gtf_filter:
merged_processor = BAMMergedProcessor(bam_processors, 'select_paired_alignments')
it = merged_processor.merged_iterators()
else:
headers = headers + ["gene_name_confidence", "gene_name"]
merged_processor = BAMMergedProcessor(bam_processors, 'only_exonic_mappings')
it = merged_processor.select_same_gene()
headers = headers + bam_files
print "\t".join(headers)
last_n_groups = 0
n_lines = 0
bam_reading_stopwatch = StopWatch()
for entry in it:
print toString(entry)
n_lines += 1
if merged_processor.n_groups >= last_n_groups+10000:
n_bam_aligns = merged_processor.stat_sum('n_bam_aligns')
print >> sys.stderr, "Found %d reads across all BAM files (%d alignments processed -- %.2f per second)" % (merged_processor.n_groups, n_bam_aligns, (n_bam_aligns-last_n_bam_aligns)/bam_reading_stopwatch.reset())
last_n_bam_aligns = n_bam_aligns
last_n_groups = merged_processor.n_groups
BAMProcessor.n_paired_alignments_after_filter = property(lambda self: self.n_paired_alignments-self.n_non_exonic_bam_aligns-self.n_different_genes_pair-(self.n_partially_exonic if options.strict else 0))
attr_names = ['n_bam_aligns', 'n_singletons', 'n_multiple_hits', 'n_paired_alignments', 'n_non_exonic_bam_aligns', 'n_different_genes_pair', 'n_partially_exonic', 'n_fully_exonic', 'n_paired_alignments_after_filter', 'n_gene_match']
for stat_name in attr_names:
setattr(merged_processor, stat_name, merged_processor.stat_sum(stat_name))
print >> sys.stderr, "Finished reading the BAM files in %.2f seconds" % main_stopwatch.reset()
print >> sys.stderr, "%d alignments in total across all %d BAM files" % (merged_processor.n_bam_aligns, len(bam_files))
print >> sys.stderr, "\t%d discarded (%.2f%%) - singletons " % (merged_processor.n_singletons, 100.*merged_processor.n_singletons/merged_processor.n_bam_aligns)
print >> sys.stderr, "\t%d discarded (%.2f%%) - multiple hits (NH != 1)" % (merged_processor.n_multiple_hits, 100.*merged_processor.n_multiple_hits/merged_processor.n_bam_aligns)
print >> sys.stderr, "%d paired alignments in total" % merged_processor.n_paired_alignments
if not options.no_gtf_filter:
print >> sys.stderr, "\t%d discarded (%.2f%%) - both not exonic" % (merged_processor.n_non_exonic_bam_aligns, 100.*merged_processor.n_non_exonic_bam_aligns/merged_processor.n_paired_alignments)
print >> sys.stderr, "\t%d discarded (%.2f%%) - multiple genes hit" % (merged_processor.n_different_genes_pair, 100.*merged_processor.n_different_genes_pair/merged_processor.n_paired_alignments)
print >> sys.stderr, "\t%d %s (%.2f%%) - only one not exonic" % (merged_processor.n_partially_exonic, 'discarded' if options.strict else 'kept', 100.*merged_processor.n_partially_exonic/merged_processor.n_paired_alignments)
print >> sys.stderr, "\t%d kept (%.2f%%) - both exonic in same gene" % (merged_processor.n_fully_exonic, 100.*merged_processor.n_fully_exonic/merged_processor.n_paired_alignments)
print >> sys.stderr, "%d paired alignments in total after GTF filtering" % merged_processor.n_paired_alignments_after_filter
print >> sys.stderr, "%d unique read names" % merged_processor.n_groups
if merged_processor.n_groups and (not options.no_gtf_filter):
print >> sys.stderr, "Gene name assignment statistics"
print >> sys.stderr, "\t%d reads (%.2f%%): single candidate" % (merged_processor.n_unique_groups, 100.*merged_processor.n_unique_groups/merged_processor.n_groups)
if merged_processor.n_best_groups:
print >> sys.stderr, "\t%d reads (%.2f%%): multiple candidates, lowest NM score selected" % (merged_processor.n_best_groups, 100.*merged_processor.n_best_groups/merged_processor.n_groups)
if merged_processor.n_ambiguous_groups:
print >> sys.stderr, "\t%d reads (%.2f%%): multiple candidates, tie - %d candidates listed (%.2f per read on average)" % (merged_processor.n_ambiguous_groups, 100.*merged_processor.n_ambiguous_groups/merged_processor.n_groups, n_lines-merged_processor.n_unique_groups-merged_processor.n_best_groups, float(n_lines-merged_processor.n_unique_groups-merged_processor.n_best_groups)/merged_processor.n_ambiguous_groups)
if options.log_file:
with open(options.log_file, 'w') as fh:
print >> fh, '\t'.join(['# filename'] + attr_names)
for (filename, bam_processor) in zip(bam_files, bam_processors):
print >> fh, '\t'.join([filename] + [str(getattr(bam_processor, attr_name)) for attr_name in attr_names])
#Return a non-zero code if we couldn't find any groups
if not merged_processor.n_groups:
sys.exit(1)