#!/usr/bin/env python
# -*- coding: utf-8 -*-
""" ALFA provides a global overview of the distribution of genomic features composing NGS dataset(s). """
__author__ = "bahin & noel"
import os
import sys
import re
import numpy as np
import collections
import copy
import argparse
import pysam
import pybedtools
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
from multiprocessing import Pool
import progressbar
# To correctly embed the texts when saving plots in svg format
matplotlib.rcParams["svg.fonttype"] = "none"
##########################################################################
# FUNCTIONS #
##########################################################################
def init_dict(d, key, init):
if key not in d:
d[key] = init
def tryint(s):
""" Function called by "alphanum_key" function to sort the chromosome names. """
try:
return int(s)
except ValueError:
return s
def alphanum_key(s):
""" Turn a string into a list of string and number chunks.
"z23a" -> ["z", 23, "a"]
"""
return [tryint(c) for c in re.split("([0-9]+)", s)]
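# A minimal usage sketch (hypothetical values): "alphanum_key" enables a natural sort of chromosome names:
# >>> sorted(["chr10", "chr2", "chrX"], key=alphanum_key)
# ['chr2', 'chr10', 'chrX']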
def existing_file(filename):
""" Checks if filename already exists and exit if so. """
if os.path.isfile(filename):
sys.exit("Error: The file '" + filename + "' is about to be produced but already exists in the directory. \n### End of program")
def get_chromosome_names_in_GTF(options):
""" Function to get the list of chromosome names present in the provided GTF file. """
chr_list = []
with open(options.annotation, "r") as GTF_file:
for line in GTF_file:
if not line.startswith("#"):
chr = line.split("\t")[0]
if chr not in chr_list:
chr_list.append(chr)
return sorted(chr_list)
def get_chromosome_names_in_index(genome_index, index_chrom_list):
""" Returns the chromosome names in a genome index file. """
with open(genome_index, "r") as index_fh:
for line in index_fh:
if not line.startswith("#") and (line.split("\t")[0] not in index_chrom_list):
index_chrom_list.append(line.split("\t")[0])
return index_chrom_list
def GTF_splitter(GTF_file, output_dir, chunk_basename, size=10000):
""" Function to split a GTF file into chunks of one chromosome or several chromosomes/scaffolds up to N (default=10k) lines. """
if os.path.isfile(output_dir + chunk_basename + "1.gtf"):
sys.exit("Error: There is already a file called '" + chunk_basename + "1.gtf' in the directory. Running the command would crush this file. Aborting")
prev_chr = "" # Chr/scaffold previously processed
prev_cpt = 0 # Currently building chunk file line counter
cpt = 0 # Processed chr/scaffold line counter
cpt_chunk = 1 # Chunk counter
current_file = open(output_dir + "current.gtf", "w") # New piece to add to the building chunk file (one chromosome/scaffold)
# Processing the input GTF file
with open(GTF_file, "r") as input_file:
for line in input_file:
# Burning header lines
if line.startswith("#"):
continue
# Getting the chromosome/scaffold
chr = line.split("\t")[0] # Processed chr/scaffold
# Reaching another chromosome/scaffold
if (chr != prev_chr) and (prev_chr != ""):
if cpt > size:
# Packing up the processed chr/scaffold
current_file.close()
os.rename(output_dir + "current.gtf", output_dir + chunk_basename + str(cpt_chunk) + ".gtf")
current_file = open(output_dir + "current.gtf", "w")
# Updating counters
cpt_chunk += 1
else:
if cpt + prev_cpt > size:
# Packing up the currently building chunk file without the last chr/scaffold
os.rename(output_dir + "old.gtf", output_dir + chunk_basename + str(cpt_chunk) + ".gtf")
# Updating counters
cpt_chunk += 1
prev_cpt = 0
# Moving the new piece to the currently building chunk file
current_file.close()
with open(output_dir + "current.gtf", "r") as input_file, open(output_dir + "old.gtf", "a") as output_file:
for line in input_file:
output_file.write(line)
current_file = open(output_dir + "current.gtf", "w")
# Updating counters
prev_cpt += cpt
# Updating the processed chr/scaffold line counter and the previous chromosome
cpt = 0
prev_chr = chr
current_file.write(line)
else: # First content line or another line for the processed chr/scaffold
current_file.write(line)
prev_chr = chr
cpt += 1
# Processing the last chromosome/scaffold
current_file.close()
if prev_cpt == 0: # There was only one chromosome/scaffold in the annotation file
# Packing up the processed chr/scaffold
os.rename(output_dir + "current.gtf", output_dir + chunk_basename + str(cpt_chunk) + ".gtf")
else:
if prev_cpt + cpt > size:
# Packing up the processed chr/scaffold
os.rename(output_dir + "current.gtf", output_dir + chunk_basename + str(cpt_chunk) + ".gtf")
cpt_chunk += 1
else:
# Moving the new piece to the currently building chunk file
with open(output_dir + "current.gtf", "r") as input_file, open(output_dir + "old.gtf", "a") as output_file:
for line in input_file:
output_file.write(line)
os.remove(output_dir + "current.gtf")
# Packing up the currently building chunk file without the last chr/scaffold
os.rename(output_dir + "old.gtf", output_dir + chunk_basename + str(cpt_chunk) + ".gtf")
def get_chromosome_lengths(options):
"""
Parse the file containing the chromosome lengths.
If no length file is provided, browse the annotation file (GTF) to estimate the chromosome sizes.
"""
lengths = {}
gtf_chrom_names = set()
# If the user provided a chromosome length file
if options.chr_len:
# Getting the chromosome lengths from the chromosome lengths file
with open(options.chr_len, "r") as chr_len_fh:
for line in chr_len_fh:
try:
lengths[line.split("\t")[0]] = int(line.rstrip().split("\t")[1])
except IndexError:
sys.exit("Error: The chromosome lengths file is not correctly formed. It is supposed to be tabulated file with two fields per line.")
# Getting the chromosome lengths from the GTF file
with open(options.annotation, "r") as gtf_fh:
for line in gtf_fh:
if not line.startswith("#"):
gtf_chrom_names.add(line.split("\t")[0])
# Checking if the chromosomes from the chromosome lengths file are present in the GTF file
for chrom in lengths:
if chrom not in gtf_chrom_names:
print("Warning: chromosome '" + chrom + "' of the chromosome lengths file does not match any chromosome name in the GTF file provided and was ignored.", file=sys.stderr)
# Checking if the chromosomes from the GTF file are present in the lengths file
for chrom in gtf_chrom_names:
if chrom not in lengths:
print("Warning: at least one chromosome ('" + chrom + "') was found in the GTF file and does not match any chromosome provided in the lengths file.", file=sys.stderr)
print("\t=> All the chromosome lengths will be approximated using annotations in the GTF file.", file=sys.stderr) ## MB: better than the first end of line??
break
else:
return lengths
# If no chromosome lengths file was provided or if at least one chromosome was missing in the file, the end of the last annotation of the chromosome in the GTF file is considered as the chromosome length
with open(options.annotation, "r") as gtf_fh:
for line in gtf_fh:
if not line.startswith("#"):
chrom = line.split("\t")[0]
end = int(line.split("\t")[4])
init_dict(lengths, chrom, 0)
lengths[chrom] = max(lengths[chrom], end)
print("The chromosome lengths have been approximated using the GTF file annotations (the stop position of the last annotation of each chromosome is considered as its length).")
return lengths
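# Expected chromosome lengths file layout (tab-delimited, one chromosome per line; hypothetical values):
# chr1	248956422
# chr2	242193529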
def write_index_line(feat, chrom, start, stop, sign, fh):
""" Write a new line in an index file. """
# Formatting the features info
feat_by_biotype = []
for biot, cat in feat.items():
feat_by_biotype.append(":".join((str(biot), ",".join(sorted(cat)))))
# Writing the features info in the index file
fh.write("\t".join((chrom, start, stop, sign)) + "\t" + "\t".join(feat_by_biotype) + "\n")
def write_index(feat_values, chrom, start, stop, stranded_genome_index, unstranded_genome_index):
""" Writing the features info in the proper index files. """
# Writing info to the stranded indexes
if feat_values[0] != {}:
write_index_line(feat_values[0], chrom, start, stop, "+", stranded_genome_index)
else:
stranded_genome_index.write("\t".join((chrom, start, stop, "+", "antisense\n")))
if feat_values[1] != {}:
write_index_line(feat_values[1], chrom, start, stop, "-", stranded_genome_index)
else:
stranded_genome_index.write("\t".join((chrom, start, stop, "-", "antisense\n")))
# Writing info to the unstranded index
unstranded_feat = dict(feat_values[0], **feat_values[1])
for name in set(feat_values[0]) & set(feat_values[1]):
unstranded_feat[name] += feat_values[0][name]
write_index_line(unstranded_feat, chrom, start, stop, ".", unstranded_genome_index)
def register_interval(features_dict, chrom, stranded_index_fh, unstranded_index_fh):
""" Write the interval features info into the genome index files. """
# Writing the interval in the index file
with open(unstranded_index_fh, "a") as unstranded_index_fh, open(stranded_index_fh, "a") as stranded_index_fh:
# Initializing the first interval start and features
sorted_pos = sorted(features_dict["+"].keys())
interval_start = sorted_pos[0]
features_plus = features_dict["+"][interval_start]
features_minus = features_dict["-"][interval_start]
# Browsing the interval boundaries
for interval_stop in sorted_pos[1:]:
# Writing the current interval features to the indexes
write_index([features_plus, features_minus], chrom, str(interval_start), str(interval_stop), stranded_index_fh, unstranded_index_fh)
# Initializing the new interval start and features
interval_start = interval_stop
# Store current features
prev_features_plus = features_plus
prev_features_minus = features_minus
# Update features
features_plus = features_dict["+"][interval_start]
features_minus = features_dict["-"][interval_start]
# If feature == transcript and prev interval's feature is exon => add intron feature
for biotype, categ in features_plus.items():
if categ == ["gene", "transcript"]:
if "exon" in prev_features_plus[biotype] or "intron" in prev_features_plus[biotype]:
categ.append("intron")
else:
continue
for biotype, categ in features_minus.items():
if categ == ["gene", "transcript"]:
if "exon" in prev_features_minus[biotype]:
categ.append("intron")
else:
continue
def generate_genome_index_1chr(arguments):
""" Build the stranded and unstranded ALFA index files for one annotation chunk. """
# Getting the arguments
(annotation, chunk_basename, reverse_strand, options) = arguments
# Setting the annotation file basename
annotation_basename = re.sub(".gtf$", "", annotation)
# Processing the annotation file
with open(annotation, "r") as gtf_fh:
max_value = -1 # Maximum value of the currently processed interval
intervals_dict = {}
prev_chrom = ""
for line in gtf_fh:
# Processing lines except comment ones
if not line.startswith("#"):
# Getting the line info
line_split = line.rstrip().split("\t")
chrom = line_split[0]
cat = line_split[2]
start = int(line_split[3]) - 1
stop = int(line_split[4])
strand = line_split[6]
antisense = reverse_strand[strand]
biotype = line_split[8].split("gene_biotype")[1].split(";")[0].strip('" ')
# Registering stored features info in the genome index file(s) if the new line concerns a new chromosome or the new line concerns an annotation not overlapping previously recorded ones
if (start > max_value) or (chrom != prev_chrom):
# Write the previous features
if intervals_dict:
register_interval(intervals_dict, prev_chrom, annotation_basename + ".stranded.ALFA_index", annotation_basename + ".unstranded.ALFA_index")
if chrom != prev_chrom:
with open(options.output_dir + chunk_basename + "txt", "a") as chrom_list_fh:
chrom_list_fh.write(chrom + "\n")
prev_chrom = chrom
# (Re)Initializing the intervals info dict
intervals_dict = {strand: {start: {biotype: [cat]}, stop: {}}, antisense: {start: {}, stop: {}}}
max_value = stop
# Update the dictionary which represents intervals for every distinct annotation
else:
# Storing the intervals on the strand of the current feature
stranded_intervals = intervals_dict[strand]
added_info = False # Variable to know if the features info were already added
# Browsing the existing boundaries
for boundary in sorted(stranded_intervals):
# While the GTF line start is after the browsed boundary: keep the browsed boundary features info in case the GTF line start is before the next boundary
if boundary < start:
stored_feat_strand = copy.deepcopy(dict(stranded_intervals[boundary]))
stored_feat_antisense = copy.deepcopy(dict(intervals_dict[antisense][boundary]))
continue
# The GTF line start is already an existing boundary: store the existing features info (to manage after the GTF line stop) and update it with the GTF line features info
elif boundary == start:
stored_feat_strand = copy.deepcopy(dict(stranded_intervals[boundary]))
stored_feat_antisense = copy.deepcopy(dict(intervals_dict[antisense][boundary]))
# Adding the GTF line features info to the interval
try:
if cat not in stranded_intervals[boundary][biotype]:
stranded_intervals[boundary][biotype].append(cat)
except KeyError: # If the GTF line features info regards a new biotype
stranded_intervals[boundary][biotype] = [cat]
added_info = True # The features info were added
continue
# The browsed boundary is after the GTF line start: add the GTF line features info
elif boundary > start:
# Create a new boundary for the GTF line start if necessary (if it is between 2 existing boundaries, it was not created before)
if not added_info:
stranded_intervals[start] = copy.deepcopy(stored_feat_strand)
try:
if cat not in stranded_intervals[start][biotype]:
stranded_intervals[start][biotype].append(cat)
except KeyError:
stranded_intervals[start][biotype] = [cat]
intervals_dict[antisense][start] = copy.deepcopy(stored_feat_antisense)
added_info = True # The features info were added
# While the browsed boundary is before the GTF line stop: store the existing features info (to manage after the GTF line stop) and update it with the GTF line features info
if boundary < stop:
stored_feat_strand = copy.deepcopy(dict(stranded_intervals[boundary]))
stored_feat_antisense = copy.deepcopy(dict(intervals_dict[antisense][boundary]))
try:
if cat not in stranded_intervals[boundary][biotype]:
stranded_intervals[boundary][biotype].append(cat)
except KeyError:
stranded_intervals[boundary][biotype] = [cat]
# The GTF line stop already exists: nothing more to do, the GTF line features info are integrated
elif boundary == stop:
break
# The browsed boundary is after the GTF line stop: create a new boundary for the GTF line stop (with the stored features info)
else: # boundary > stop
stranded_intervals[stop] = copy.deepcopy(stored_feat_strand)
intervals_dict[antisense][stop] = copy.deepcopy(stored_feat_antisense)
break # The GTF line features info are integrated
# If the GTF line stop is after the last boundary, extend the dictionary
if stop > max_value:
max_value = stop
stranded_intervals[stop] = {}
intervals_dict[antisense][stop] = {}
# Store the categories of the last chromosome
register_interval(intervals_dict, chrom, annotation_basename + ".stranded.ALFA_index", annotation_basename + ".unstranded.ALFA_index")
if chrom != prev_chrom:
with open(options.output_dir + chunk_basename + "txt", "a") as chrom_list_fh:
chrom_list_fh.write(chrom + "\n")
return None
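# Shape of the "intervals_dict" structure built above (a sketch with hypothetical values):
# each boundary maps to the features holding between it and the next boundary, per strand:
# {"+": {11868: {"protein_coding": ["gene", "transcript", "exon"]}, 12227: {}},
#  "-": {11868: {}, 12227: {}}}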
def generate_genome_index(unstranded_genome_index, stranded_genome_index, chrom_sizes, chunk_basename, options, reverse_strand):
""" Create an index of the genome annotations and save it in a file. """
# Write the chromosome lengths as comment lines before the genome index
with open(unstranded_genome_index, "w") as unstranded_index_fh, open(stranded_genome_index, "w") as stranded_index_fh:
for key, value in list(chrom_sizes.items()):
unstranded_index_fh.write("#%s\t%s\n" % (key, value))
stranded_index_fh.write("#%s\t%s\n" % (key, value))
# Chunk file list creation
chunks = np.array([options.output_dir + f for f in os.listdir(options.output_dir) if f.startswith(chunk_basename) and f.endswith(".gtf")])
# Sorting the chunks by file size
file_sizes = np.array([os.stat(f).st_size for f in chunks])
chunks = chunks[file_sizes.argsort()]
# Progress bar to track the genome indexes creation
pbar = progressbar.ProgressBar(widgets=["Indexing the genome ", progressbar.Percentage(), " ", progressbar.Bar(), progressbar.Timer()], maxval=len(chunks)).start()
pool = Pool(options.nb_processors)
list(pbar(pool.imap_unordered(generate_genome_index_1chr, zip(chunks, [chunk_basename] * len(chunks), [reverse_strand] * len(chunks), [options] * len(chunks)))))
"""
# Non-parallel version for debugging
for f in chunks:
generate_genome_index_1chr(f)
"""
def merge_index_chunks(unstranded_genome_index, stranded_genome_index, chunk_basename, output_dir):
""" Merges the genome index chunks into a single file. """
for fh, strandness in zip([unstranded_genome_index, stranded_genome_index], ["unstranded", "stranded"]):
files = [output_dir + f for f in os.listdir(output_dir) if f.startswith(chunk_basename) and f.endswith("." + strandness + ".ALFA_index")]
with open(fh, "a") as output_file:
for file in sorted(files):
with open(file, "r") as input_file:
for line in input_file:
output_file.write(line)
def chunks_cleaner(chunk_basename, output_dir):
""" Cleans the chunks created to index the genome. """
for f in os.listdir(output_dir):
if f.startswith(chunk_basename):
os.remove(output_dir + f)
def count_genome_features(cpt, features, start, stop, biotype_prios, options, prios, unknown_cat, coverage=1):
""" Reads genome index and registers feature counts. """
# If no biotype priority: category with the highest priority for each found biotype has the same weight (1/n_biotypes)
if not biotype_prios:
nb_biot = len(features)
if options.keep_ambiguous and nb_biot != 1:
# Increment "ambiguous" counter if more than 1 biotype
try:
cpt[("ambiguous", "ambiguous")] += (int(stop) - int(start)) * coverage
except KeyError:
cpt[("ambiguous", "ambiguous")] = (int(stop) - int(start)) * coverage
else:
# For each categ(s)/biotype pairs
for feat in features:
cur_prio = 0
# Separate the category(ies) and the biotype
try:
biot, cats = feat.split(":")
# If the feature has no "biotype:categories" structure (e.g. "opposite_strand"): update the "opposite_strand/opposite_strand" counts
except ValueError:
try:
cpt[("opposite_strand", "opposite_strand")] += (int(stop) - int(start)) * coverage / float(nb_biot)
except KeyError:
cpt[("opposite_strand", "opposite_strand")] = (int(stop) - int(start)) * coverage / float(nb_biot)
return None
# Browse the categories and get only the one(s) with highest priority
for cat in cats.split(","):
try:
prio = prios[cat]
except KeyError:
if cat not in unknown_cat:
print("Warning: Unknown categorie '%s' found and ignored.\n" % cat, file=sys.stderr)
unknown_cat.add(cat)
continue
# Check if the category has a higher priority than the current one
if prio > cur_prio:
cur_prio = prio
cur_cat = {cat}
if prio == cur_prio:
cur_cat.add(cat)
nb_cat = len(cur_cat)
if options.keep_ambiguous and nb_cat != 1:
# Increment "ambiguous" counter if more than 1 category
try:
cpt[("ambiguous", "ambiguous")] += (int(stop) - int(start)) * coverage / (float(nb_biot))
except KeyError:
cpt[("ambiguous", "ambiguous")] = (int(stop) - int(start)) * coverage / (float(nb_biot))
else:
# Increase each counts by the coverage divided by the number of categories and biotypes
for cat in cur_cat:
try:
cpt[(cat, biot)] += (int(stop) - int(start)) * coverage / (float(nb_biot * nb_cat))
except KeyError:
cpt[(cat, biot)] = (int(stop) - int(start)) * coverage / (float(nb_biot * nb_cat))
else:
# TODO: Add an option to provide biotype priorities and handle it
pass
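# A minimal call sketch (hypothetical values): "features" is a list of "biotype:categories"
# strings taken from an index line, e.g.:
# count_genome_features(cpt, ["protein_coding:exon,gene"], "11868", "12227", None, options, prios, set())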
def read_index(genome_index, lengths, cpt_genome, options, prios, index_chrom_list, biotype_prios, unknown_cat):
""" Parse index files info (chromosomes list, lengths and genome features). """
with open(genome_index, "r") as index_fh:
for line in index_fh:
if line.startswith("#"):
lengths[line.split("\t")[0][1:]] = int(line.split("\t")[1])
else:
chrom = line.split("\t")[0]
if chrom not in index_chrom_list:
index_chrom_list.append(chrom)
count_genome_features(cpt_genome, line.rstrip().split("\t")[4:], line.split("\t")[1], line.split("\t")[2], biotype_prios, options, prios, unknown_cat)
def run_genomecov(arguments):
""" Run genomecov (from Bedtools through pybedtools lib) for a set of parameters to produce a BedGraph file. """
(strand, bam_file, sample_label, name, bedgraph_extension, options) = arguments
# Load the BAM file
input_file = pybedtools.BedTool(bam_file)
# Run genomecov
if strand == "":
input_file.genome_coverage(bg=True, split=True).saveas(options.output_dir + sample_label + name + bedgraph_extension)
else:
input_file.genome_coverage(bg=True, split=True, strand=strand).saveas(options.output_dir + sample_label + name + bedgraph_extension)
return None
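# A minimal sketch of the underlying call (hypothetical file name):
# pybedtools.BedTool("sample.bam").genome_coverage(bg=True, split=True, strand="+")
# mirrors "bedtools genomecov -ibam sample.bam -bg -split -strand +".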
def generate_bedgraph_files_parallel(sample_labels, bam_files, options, bedgraph_extension):
""" Creates, through multi-processors, BedGraph files from BAM ones. """
# Sorting the BAM file on size to process the biggest first
files = list(zip(sample_labels, bam_files, [os.stat(i).st_size for i in bam_files], [options] * len(sample_labels)))
files.sort(key=lambda p: p[2], reverse=True)
# Defining parameters sets to provide to the genomecov instances to run
parameter_sets = []
for l, b, s, o in files:
# If the dataset is stranded, one BedGraph file for each strand is created
if options.strandness in ["forward", "fr-firststrand"]:
parameter_sets.append(["+", b, l, ".plus", bedgraph_extension, o])
parameter_sets.append(["-", b, l, ".minus", bedgraph_extension, o])
elif options.strandness in ["reverse", "fr-secondstrand", bedgraph_extension, o]:
parameter_sets.append(["-", b, l, ".plus", bedgraph_extension, o])
parameter_sets.append(["+", b, l, ".minus", bedgraph_extension, o])
else:
parameter_sets.append(["", b, l, "", bedgraph_extension, o])
# Setting the progressbar
pbar = progressbar.ProgressBar(widgets=["Generating the BedGraph files ", progressbar.Percentage(), progressbar.Bar(), progressbar.SimpleProgress(), "|", progressbar.Timer()], maxval=len(parameter_sets)).start()
# Setting the processors number
pool = Pool(options.nb_processors)
# Running the instances
list(pbar(pool.imap_unordered(run_genomecov, parameter_sets)))
pybedtools.cleanup() # If pybedtools can't finish properly but the program is not stopped, the /tmp will be cleaned anyway from pybedtools temp files
return None
def read_gtf(gtf_index_file, sign):
""" Read index lines until one matching the requested strand sign is found, updating the global GTF pointers. """
global gtf_line, gtf_chrom, gtf_start, gtf_stop, gtf_features, endGTF
strand = ""
while strand != sign:
gtf_line = gtf_index_file.readline()
# If the GTF file is finished
if not gtf_line:
endGTF = True
return endGTF
splitline = gtf_line.rstrip().split("\t")
try:
strand = splitline[3]
# The strand information cannot be found in the file header lines
except IndexError:
pass
gtf_chrom = splitline[0]
gtf_features = splitline[4:]
gtf_start, gtf_stop = list(map(int, splitline[1:3]))
return endGTF
def intersect_bedgraphs_and_index_to_count_categories_1_file(arguments):
""" Intersect one BedGraph file (one strand) with the genome index and count nucleotides per category/biotype pair. """
global gtf_line, gtf_chrom, gtf_start, gtf_stop, gtf_features, endGTF
(sample_labels, bedgraph_files, biotype_prios, bedgraph_extension, genome_index, options, prios, index_chrom_list, unknown_cat, strand, sign) = arguments # For Python 3
unknown_chrom = []
cpt = {} # Counter for the nucleotides in the BAM input file(s)
prev_chrom = ""
endGTF = False # Reaching the next chr or the end of the GTF index
intergenic_adds = 0.0
# Checking if the BedGraph file is empty
if os.stat(bedgraph_files + strand + bedgraph_extension).st_size == 0:
return sample_labels, sign, {}, []
with open(bedgraph_files + strand + bedgraph_extension, "r") as bedgraph_fh:
# Running through the BedGraph file
for bam_line in bedgraph_fh:
# Getting the BAM line info
bam_chrom = bam_line.split("\t")[0]
bam_start, bam_stop, bam_cpt = list(map(float, bam_line.split("\t")[1:4]))
# Skip the line if the chromosome is not in the index
if bam_chrom not in index_chrom_list:
if bam_chrom not in unknown_chrom:
unknown_chrom.append(bam_chrom)
continue
# If this is a new chromosome (or the first one)
if bam_chrom != prev_chrom:
intergenic_adds = 0.0
# Closing the GTF file if it was open (exception caught only for the first chr)
try:
gtf_index_file.close()
except UnboundLocalError:
pass
# (Re)opening the GTF index and looking for the first line of the matching chr
gtf_index_file = open(genome_index, "r")
endGTF = False
read_gtf(gtf_index_file, sign)
while bam_chrom != gtf_chrom:
read_gtf(gtf_index_file, sign)
if endGTF:
break
prev_chrom = bam_chrom
# Looking for the first matching annotation in the GTF index
while (not endGTF) and (gtf_chrom == bam_chrom) and (bam_start >= gtf_stop):
read_gtf(gtf_index_file, sign)
if gtf_chrom != bam_chrom:
endGTF = True
# Processing BAM lines located before the first GTF annotation, if any
if bam_start < gtf_start:
# Increase the "intergenic" category counter with all or part of the BAM interval
try:
intergenic_adds += min(bam_stop, gtf_start) - bam_start
cpt[("intergenic", "intergenic")] += (min(bam_stop, gtf_start) - bam_start) * bam_cpt
except KeyError:
cpt[("intergenic", "intergenic")] = (min(bam_stop, gtf_start) - bam_start) * bam_cpt
# Go to the next line if the BAM interval was entirely upstream of the first GTF annotation, carry on with the remaining part otherwise
if endGTF or (bam_stop <= gtf_start):
continue
else:
bam_start = gtf_start
# We can start the crossover
while not endGTF:
# Update category counter
count_genome_features(cpt, gtf_features, bam_start, min(bam_stop, gtf_stop), biotype_prios, options, prios, unknown_cat, coverage=bam_cpt)
# Read the next GTF file line if the BAM line is not entirely covered
if bam_stop > gtf_stop:
# Update the BAM start pointer
bam_start = gtf_stop
endGTF = read_gtf(gtf_index_file, sign)
# If we read a new chromosome in the GTF file then it is considered finished
if bam_chrom != gtf_chrom:
endGTF = True
if endGTF:
break
else:
# Next if the BAM line is entirely covered
bam_start = bam_stop
break
# Processing the end of the BAM line if necessary
if endGTF and (bam_stop > bam_start):
try:
cpt[("intergenic", "intergenic")] += (bam_stop - bam_start) * bam_cpt
except KeyError:
cpt[("intergenic", "intergenic")] = (bam_stop - bam_start) * bam_cpt
# In stranded mode, if one of the BedGraph files doesn't have any of the chromosomes from the reference file, the error is not detected during the preprocessing (a chromosome is found within the other BedGraph file)
try:
gtf_index_file.close()
except UnboundLocalError:
# Then the file is not opened and can't be closed
pass
return sample_labels, sign, cpt, unknown_chrom
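# The BedGraph lines parsed above follow the standard 4-column layout
# (chromosome, start, stop, coverage), e.g. (hypothetical values):
# chr1	11868	11900	42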
def intersect_bedgraphs_and_index_to_count_categories(sample_labels, bedgraph_files, options, bedgraph_extension, genome_index, prios, index_chrom_list, unknown_cat, biotype_prios=None):
""" Intersect all BedGraph files with the genome index (in parallel) and merge the per-strand counts for each sample. """
# Initializing variables
unknown_chrom = []
cpt = {} # Counter for the nucleotides in the BAM input file(s)
if bedgraph_files == []:
bedgraph_files = [options.output_dir + s for s in sample_labels]
if options.strandness == "unstranded":
strands = [("", ".")]
else:
strands = [(".plus", "+"), (".minus", "-")]
# Initializing the progress bar
pbar = progressbar.ProgressBar(widgets=["Intersecting BAM and genome ", progressbar.Percentage(), " ", progressbar.Bar(), progressbar.SimpleProgress(), "|", progressbar.Timer()], maxval=len(sample_labels) * len(strands)).start()
pool = Pool(options.nb_processors)
inputs = [sample + strand for sample in zip(sample_labels, bedgraph_files, [biotype_prios] * len(sample_labels), [bedgraph_extension] * len(sample_labels), [genome_index] * len(sample_labels), [options] * len(sample_labels), [prios] * len(sample_labels), [index_chrom_list] * len(sample_labels), [unknown_cat] * len(sample_labels)) for strand in strands]
# Running the intersection in parallel
results = list(pbar(pool.imap_unordered(intersect_bedgraphs_and_index_to_count_categories_1_file, inputs)))
"""
# Non-parallel version for debugging
results = []
for i in inputs:
results.append(intersect_bedgraphs_and_index_to_count_categories_1_file(i))
"""
# Reformatting the results
for res in results:
init_dict(cpt, res[0], {})
init_dict(cpt[res[0]], res[1], res[2])
unknown_chrom.append(res[3])
# Merging strands counts for the same samples
final_cpt = {}
for sample in cpt:
final_cpt[sample] = {}
for strand in strands:
for feat in cpt[sample][strand[1]]:
try:
final_cpt[sample][feat] += cpt[sample][strand[1]][feat]
except KeyError:
final_cpt[sample][feat] = cpt[sample][strand[1]][feat]
print("Unknown chromosomes: " + str(set([i for u in unknown_chrom for i in u])) + ".")
return final_cpt
def write_counts_in_files(cpt, genome_counts, output_dir):
""" Writes the biotype/category counts in an output file. """
for sample_label, counters in list(cpt.items()):
sample_label = "_".join(re.findall(r"[\w\-']+", sample_label))
with open(output_dir + sample_label + ".ALFA_feature_counts.tsv", "w") as output_fh:
output_fh.write("#Category,biotype\tCounts_in_BAM/BedGraph\tSize_in_genome\n")
for features_pair, counts in list(counters.items()):
output_fh.write("%s\t%s\t%s\n" % (",".join(features_pair), counts, genome_counts[features_pair]))
def read_counts(sample_labels, counts_files):
""" Reads the counts from an input file. """
cpt = {}
cpt_genome = {}
for sample_label, filename in zip(sample_labels, counts_files):
cpt[sample_label] = {}
with open(filename, "r") as counts_fh:
for line in counts_fh:
if not line.startswith("#"):
feature = tuple(line.split("\t")[0].split(","))
cpt[sample_label][feature] = float(line.split("\t")[1])
cpt_genome[feature] = float(line.rstrip().split("\t")[2])
return cpt, cpt_genome
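# Expected counts file layout, as produced by "write_counts_in_files" (hypothetical values):
# #Category,biotype	Counts_in_BAM/BedGraph	Size_in_genome
# exon,protein_coding	1500000.0	82000000.0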
def group_counts_by_categ(cpt, cpt_genome, final, selected_biotype):
final_cat_cpt = {}
final_genome_cpt = {}
filtered_cat_cpt = {}
for f in cpt:
final_cat_cpt[f] = {}
filtered_cat_cpt[f] = {}
for final_cat in final:
tot = 0
tot_filter = 0
tot_genome = 0
for cat in final[final_cat]:
for key, value in list(cpt[f].items()):
if key[0] == cat:
tot += value
tot_genome += cpt_genome[key]
if key[1] == selected_biotype:
tot_filter += value
final_cat_cpt[f][final_cat] = tot
if tot_genome == 0:
final_genome_cpt[final_cat] = 1e-100
else:
final_genome_cpt[final_cat] = tot_genome
filtered_cat_cpt[f][final_cat] = tot_filter
# if "antisense" in final_genome_cpt: final_genome_cpt["antisense"] = 0
return final_cat_cpt, final_genome_cpt, filtered_cat_cpt
def group_counts_by_biotype(cpt, cpt_genome, biotypes):
final_cpt = {}
final_genome_cpt = {}
for f in cpt:
final_cpt[f] = {}
for biot in biotypes:
tot = 0
tot_genome = 0
try:
for final_biot in biotypes[biot]:
for key, value in list(cpt[f].items()):
if key[1] == final_biot:
tot += value
# if key[1] != 'antisense':
tot_genome += cpt_genome[key]
except (KeyError, TypeError):  # "biotypes" may be a plain list of biotype names rather than a mapping of groups
for key, value in list(cpt[f].items()):
if key[1] == biot:
tot += value
tot_genome += cpt_genome[key]
if tot != 0:
final_cpt[f][biot] = tot
final_genome_cpt[biot] = tot_genome
return final_cpt, final_genome_cpt
def display_percentage_of_ambiguous(cpt, count_files_option=False):
""" Display the percentage of ambiguous counts for each sample. """
if count_files_option:
print("INFO: Ambiguous counts were discarded in at least one sample\n" \
+ " (see --ambiguous option for more information)")
else:
print("INFO: Reads matching ambiguous annotation have been discarded.\n" \
+ " To change this option, please see \"--ambiguous\" help.")
print("INFO: Percentage of ambiguous counts:")
# Loop for each sample
for sample, counts in cpt.items():
# Compute and display the percentage of ambiguous counts
try:
ambiguous = counts[('ambiguous', 'ambiguous')]
total = sum([count for feat, count in counts.items() if 'intergenic' not in feat])
print(" {!s:25.25} {:5.2f}% of ambiguous".format(sample, float(ambiguous / total) * 100))
# If ambiguous is not in the count file
except KeyError:
if count_files_option:
print(
" {!s:25.25} {:3.2f}% of ambiguous (this sample may have been processed with --ambiguous option)".format(sample, 0))
else:
print(" {!s:25.25} {:3.2f}% of ambiguous".format(sample, 0))
def recategorize_the_counts(cpt, cpt_genome, final):
final_cat_cpt = {}
final_genome_cpt = {}
for f in cpt:
# print "\nFinal categories for",f,"sample"
final_cat_cpt[f] = {}
for final_cat in final:
tot = 0
tot_genome = 0
for cat in final[final_cat]:
tot += cpt[f][cat]
tot_genome += cpt_genome[cat]
final_cat_cpt[f][final_cat] = tot
final_genome_cpt[final_cat] = tot_genome
return final_cat_cpt, final_genome_cpt
def one_sample_plot(ordered_categs, percentages, enrichment, n_cat, index, index_enrichment, bar_width, counts_type,
title, sample_labels):
""" Plot the category distribution and enrichment (barplots and piecharts) for a single sample. """
### Initialization
fig = plt.figure(figsize=(13, 9))
ax1 = plt.subplot2grid((2, 4), (0, 0), colspan=2)
ax2 = plt.subplot2grid((2, 4), (1, 0), colspan=2)
cmap = plt.get_cmap("Spectral")
cols = [cmap(x) for x in range(0, 256, 256 // n_cat)]  # Integer step required by "range" in Python 3
if title:
ax1.set_title(title + "in: %s" % sample_labels[0])
else:
ax1.set_title(counts_type + " distribution in mapped reads in: %s" % sample_labels[0])
ax2.set_title("Normalized counts of " + counts_type)
### Barplots
# First barplot: percentage of reads in each category
ax1.bar(index, percentages, bar_width,
color=cols)
# Second barplot: enrichment relative to the genome for each categ
# (the reads count in a categ is divided by the categ size in the genome)
ax2.bar(index_enrichment, enrichment, bar_width,
color=cols, )
### Piecharts
pielabels = [ordered_categs[i] if percentages[i] > 0.025 else "" for i in range(n_cat)]
sum_enrichment = np.sum(enrichment)
pielabels_enrichment = [ordered_categs[i] if enrichment[i] / sum_enrichment > 0.025 else "" for i in range(n_cat)]
# Categories piechart
ax3 = plt.subplot2grid((2, 4), (0, 2))
pie_wedge_collection, texts = ax3.pie(percentages, labels=pielabels, shadow=True, colors=cols)
# Enrichment piechart
ax4 = plt.subplot2grid((2, 4), (1, 2))
pie_wedge_collection, texts = ax4.pie(enrichment, labels=pielabels_enrichment, shadow=True, colors=cols)
# Add legends (append percentages to labels)
labels = [" ".join((ordered_categs[i], "({:.1%})".format(percentages[i]))) for i in range(len(ordered_categs))]
ax3.legend(pie_wedge_collection, labels, loc="center", fancybox=True, shadow=True, prop={"size": "medium"},
bbox_to_anchor=(1.7, 0.5))
labels = [" ".join((ordered_categs[i], "({:.1%})".format(enrichment[i] / sum_enrichment))) for i in
range(len(ordered_categs))] # if ordered_categs[i] != "antisense"]
ax4.legend(pie_wedge_collection, labels, loc="center", fancybox=True, shadow=True, prop={"size": "medium"},
bbox_to_anchor=(1.7, 0.5))
# Set aspect ratio to be equal so that pie is drawn as a circle
ax3.set_aspect("equal")
ax4.set_aspect("equal")
return fig, ax1, ax2
def make_plot(sample_labels, ordered_categs, categ_counts, genome_counts, counts_type, options, title=None, categ_groups=[]):
#Test matplotlib version. If __version__ >= 2, use a shift value to correct the positions of bars and xticks
if int(matplotlib.__version__[0]) >= 2:
shift_mpl = 0.5
else:
shift_mpl = 0
# From ordered_categs, keep only the features (categs or biotypes) that we can find in at least one sample.
existing_categs = set()
for sample in list(categ_counts.values()):
existing_categs |= set(sample.keys())
ordered_categs = list(filter(existing_categs.__contains__, ordered_categs))
xlabels = [cat if len(cat.split("_")) == 1 else "\n".join(cat.split("_")) if cat.split("_")[0] != 'undescribed' else "\n".join(["und.",cat.split("_")[1]]) for cat in ordered_categs]
n_cat = len(ordered_categs)
nb_samples = len(categ_counts)
## Initialization of the counts matrix (nrow=nb_samples, ncol=nb_categories)
counts = np.zeros(shape=(nb_samples, n_cat))
for sample_label in sample_labels:
for cat in range(len(ordered_categs)):
try:
counts[sample_labels.index(sample_label), cat] = categ_counts[sample_label][ordered_categs[cat]]
except KeyError:
pass
## Normalize the category sizes by the total size to get percentages
sizes = []
sizes_sum = 0
for cat in ordered_categs:
sizes.append(genome_counts[cat])
sizes_sum += genome_counts[cat]
if "opposite_strand" in ordered_categs:
antisense_pos = ordered_categs.index("opposite_strand")
sizes[antisense_pos] = 1e-100
for cpt in range(len(sizes)):
sizes[cpt] /= float(sizes_sum)
## Create the array which contains the percentage of reads in each category for every sample
percentages = counts / np.sum(counts, axis=1, keepdims=True)
## Create the enrichment array (counts divided by the category sizes in the genome)
enrichment = np.array(percentages / sizes)
if "antisense_pos" in locals():
for n in range(nb_samples):
enrichment[n][antisense_pos] = 0
for n in range(nb_samples):
for i in range(n_cat):
val = enrichment[n][i]
if val > 1:
enrichment[n][i] = val - 1
elif val == 1 or val == 0:
enrichment[n][i] = 0
else:
enrichment[n][i] = -1 / val + 1
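# This maps the raw ratio onto a signed scale (a worked sketch with hypothetical values):
# a category covering twice its genomic share (ratio 2) becomes +1, an unenriched one
# (ratio 1) becomes 0, and a two-fold depleted one (ratio 0.5) becomes -1 (-1/0.5 + 1 = -1).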
#### Finally, produce the plot
##Get the colors from the colormap
ncolor = 16
cmap = ["#e47878", "#68b4e5", "#a3ea9b", "#ea9cf3", "#e5c957", "#a3ecd1", "#e97ca0", "#66d985", "#8e7ae5",
"#b3e04b", "#b884e4", "#e4e758", "#738ee3", "#e76688", "#70dddd", "#e49261"]
if nb_samples > ncolor:
cmap = plt.get_cmap("tab20", nb_samples)
cmap = [cmap(i) for i in range(nb_samples)]
## Parameters for the plot
opacity = 1
# Create a vector which contains the position of each bar
index = np.arange(n_cat)
# Size of the bars (depends on the number of samples)
bar_width = 0.9 / nb_samples
##Initialise the subplot
# if there is only one sample, also plot piecharts
# if n_exp == 1 and counts_type.lower() == 'categories':
# fig, ax1, ax2 = one_sample_plot(ordered_categs, percentages[0], enrichment[0], n_cat, index, bar_width, counts_type, title)
## If more than one sample
# else:
# Initialize the figure and the two subplots (same figure size whatever the counts type)
fig, (ax1, ax2) = plt.subplots(2, figsize=(5 + (n_cat + 2 * nb_samples) / 3, 10))
# Store the bars objects for percentages plot
rects = []
# Store the bars objects for enrichment plot
rects_enrichment = []
# For each sample/experiment
for sample_label in sample_labels:
# First barplot: percentage of reads in each category
n = sample_labels.index(sample_label)
rects.append(ax1.bar(index + n * bar_width + shift_mpl/nb_samples, percentages[n], bar_width,
alpha=opacity,
color=cmap[n],
label=sample_label, edgecolor="#FFFFFF", lw=0))
# Second barplot: enrichment relative to the genome for each category
# (the read count in a category is divided by the category size in the genome)
rects_enrichment.append(ax2.bar(index + n * bar_width + shift_mpl/nb_samples, enrichment[n], bar_width,
alpha=opacity,
color=cmap[n],
label=sample_label, edgecolor=cmap[n], lw=0))
## Graphical options for the plot
# Adding the legend
if nb_samples < 10:
ax1.legend(loc="best", frameon=False)
legend_ncol = 1
elif nb_samples < 19:
legend_ncol = 2
else:
legend_ncol = 3
ax1.legend(loc="best", frameon=False, ncol=legend_ncol)
ax2.legend(loc="best", frameon=False, ncol=legend_ncol)
# ax2.legend(loc='upper center',bbox_to_anchor=(0.5,-0.1), fancybox=True, shadow=True)
# Main titles
if title:
ax1.set_title(title)
else:
ax1.set_title(counts_type + " counts")
ax2.set_title(counts_type + " normalized counts")
# Adding enrichment baseline
# ax2.axhline(y=0,color='black',linestyle='dashed',linewidth='1.5')
# Axes limits
ax1.set_xlim(-0.1, len(ordered_categs) + 0.1)
if len(sizes) == 1: ax1.set_xlim(-2, 3)