-
Notifications
You must be signed in to change notification settings - Fork 0
/
genome_wide_distribution.py
138 lines (106 loc) · 4.26 KB
/
genome_wide_distribution.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
### Boas Pucker ###
### bpucker@cebitec.uni-bielefeld.de ###
### v0.1 ###
__usage__ = """
python genome_wide_distribution.py
--in <SVIM_VCF_FILE>
--ref <REFERENCE_SEQUENCE>
--fig <FIGURE_NAME_WITH_PDF_EXTENSION>
optional:
--score <INT, minimal SVIM score to consider variant>[10]
bug reports and feature requests: bpucker@cebitec.uni-bielefeld.de
"""
import os, glob, re, sys
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
from operator import itemgetter
# --- end of imports --- #
def load_seq_lengths( multiple_fasta_file ):
"""! @brief load candidate gene IDs from file """
seq_lens = {}
with open( multiple_fasta_file ) as f:
header = f.readline()[1:].strip().split(' ')[0]
seq = ""
line = f.readline()
while line:
if line[0] == '>':
seq_lens.update( { header: len( seq ) } )
header = line.strip()[1:].split(' ')[0]
seq = ""
else:
seq += line.strip()
line = f.readline()
seq_lens.update( { header: len( seq ) } )
return seq_lens
def plot_genome_wide_distribution( features_to_plot, chr_lengths, fig_output_file ):
"""! @brief show genome wide distribution of genes """
# --- construct plot --- #
fig, ax = plt.subplots( ) #figsize=( 5, 10 )
chr_names = sorted( chr_lengths.keys() )
y_offset = len( chr_names )
# --- adding chromosomes --- #
for idx, each in enumerate( chr_names ):
ax.plot( [ 0, chr_lengths[ each ]/1000000.0 ], [ y_offset-idx, y_offset-idx ] , color="black", linewidth=.5 )
ax.text( chr_lengths[ each ]/1000000.0 , y_offset-idx, each, fontsize=5 )
# --- adding gene positions --- #
for gene in features_to_plot:
y = y_offset-chr_names.index( gene['chr'] ) + gene['group']*0.05
x = gene['pos'] / 1000000.0
ax.scatter( x, y, s=1, color=gene['color'] )
# --- adding legend --- #
my_legend = [ mpatches.Patch( color="lime", label="DUP:TANDEM" ),
mpatches.Patch( color="magenta", label="INV" ),
mpatches.Patch( color="blue", label="INS:NOVEL" ),
mpatches.Patch( color="purple", label="DEL" ),
mpatches.Patch( color="black", label="DUP:INT" )
]
ax.legend( handles=my_legend, prop={'size': 5} )
# --- improving overall layout --- #
ax.set_xlabel( "chromosome position [Mbp]" )
ax.spines["top"].set_visible(False)
ax.spines["left"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.set_frame_on(False)
ax.axes.get_yaxis().set_visible(False)
ax.set_ylim( 0, len( chr_names )+1 )
plt.subplots_adjust( left=0.0, right=0.98, top=1.0, bottom=0.15 )
fig.savefig( fig_output_file, dpi=300 )
def load_features_from_vcf( vcf_file, score_cutoff ):
"""! @brief load all features from given SVIM VCF file """
features = []
with open( vcf_file, "r" ) as f:
line = f.readline()
while line:
if line[0] != '#':
parts = line.strip().split('\t')
if int( parts[5] ) >= score_cutoff:
if parts[ 4 ] == "<DUP:TANDEM>":
features.append( { 'chr': parts[0], 'pos': int( parts[1] ), 'color': "lime", 'group': 1 } )
elif parts[ 4 ] == "<INV>":
features.append( { 'chr': parts[0], 'pos': int( parts[1] ), 'color': "magenta", 'group': 2 } )
elif parts[ 4 ] == "<INS:NOVEL>":
features.append( { 'chr': parts[0], 'pos': int( parts[1] ), 'color': "blue", 'group': 3 } )
elif parts[ 4 ] == "<DEL>":
features.append( { 'chr': parts[0], 'pos': int( parts[1] ), 'color': "purple", 'group': 4 } )
elif parts[ 4 ] == "<DUP:INT>":
features.append( { 'chr': parts[0], 'pos': int( parts[1] ), 'color': "black", 'group': 5 } )
else:
print line
line = f.readline()
return features
def main( arguments ):
"""! @brief run everything """
vcf_file = arguments[ arguments.index( '--in' )+1 ]
fig_output_file = arguments[ arguments.index( '--fig' )+1 ]
genome_seq_file = arguments[ arguments.index( '--ref' )+1 ]
if '--score' in arguments:
score_cutoff = int( arguments[ arguments.index( '--score' )+1 ] )
else:
score_cutoff = 10
features_to_plot = load_features_from_vcf( vcf_file, score_cutoff )
chr_lengths = load_seq_lengths( genome_seq_file )
plot_genome_wide_distribution( features_to_plot, chr_lengths, fig_output_file )
if '--in' in sys.argv and '--ref' in sys.argv and '--fig' in sys.argv:
main( sys.argv )
else:
sys.exit( __usage__ )