-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathto_tabformat.py
228 lines (196 loc) · 6.99 KB
/
to_tabformat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
#!/usr/bin/env python
######################################################################################
#
## NessieOutParser.py
# Script to format the output from NeSSie related to Palindromes, Mirrors or
# Triplexes into a tab formatted file.
#
# Author: Michele Berselli
# University of Padova
# berselli.michele@gmail.com
#
## LICENSE:
# Copyright (C) 2017 Michele Berselli
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
######################################################################################
# Import libraries
import sys, argparse, re
# Functions definition
def routine_score(seq_len, align):
    """Return the score of a hit: its length minus alignment penalties.

    The alignment string is read as consecutive two-character codes. Every
    '00', '11' or '10' code lowers the score; a '01' code leaves the score
    untouched, but the code that immediately follows it costs
    ``score_increment`` instead of its usual penalty.

    Args:
        seq_len: starting value of the score.
        align: string of two-character codes, or None / '' when the hit
            carries no alignment.

    Returns:
        ``seq_len`` plus the accumulated (negative) penalties; ``seq_len``
        itself when ``align`` is None or empty.

    Raises:
        IndexError: if ``align`` has an odd number of characters, i.e. its
            last code is incomplete.
    """
    # Variables
    # NOTE(review): names suggest idls = insertions/deletions and mm = mismatch;
    # confirm against the NeSSie alignment encoding.
    score_increment, score_idls, score_mm = -2, -1, -1
    # Usual penalty of each scoring code; codes not listed here (e.g. '01')
    # do not change the score.
    penalties = {'00': score_idls, '11': score_idls, '10': score_mm}
    score = seq_len
    if not align:
        return score
    #end if
    if len(align) % 2:
        # Fail before scoring anything, with the same exception type the
        # out-of-range lookup on the dangling character would produce.
        raise IndexError('alignment string has an odd length ({0}): {1!r}'.format(len(align), align))
    #end if
    last_aln = ''
    for i in range(0, len(align), 2):
        aln = align[i] + align[i + 1]
        if aln in penalties:
            # A code right after '01' always costs score_increment.
            score += score_increment if last_aln == '01' else penalties[aln]
        #end if
        last_aln = aln
    #end for
    return score
#end def routine_score
def print_tab(dict_to_print, of, strand, seq_type, flag, counts_only, indexes_only):
    """Write every hit of ``dict_to_print`` to ``of`` in the requested order.

    Args:
        dict_to_print: mapping motif -> record dict; the record keys read
            here are 'indexes', 'counts' and 'score'.
        of: writable file-like object, handed on to routine_print_tab.
        strand, seq_type, counts_only, indexes_only: handed on unchanged to
            routine_print_tab.
        flag: ordering selector. 0 sorts by the first entry of 'indexes'
            (ascending), 1 by 'counts' (descending), any other value by
            'score' (descending).

    Raises:
        IndexError: with ``flag == 0`` when a record has an empty 'indexes'
            list.
    """
    # The sort key receives a (motif, record) pair and indexes into it rather
    # than unpacking it in the lambda signature, which only Python 2 accepts.
    if flag == 0: #0=indexes, 1=counts, 2=score
        sort_key, descending = (lambda item: item[1]['indexes'][0]), False
    elif flag == 1:
        sort_key, descending = (lambda item: item[1]['counts']), True
    else:
        sort_key, descending = (lambda item: item[1]['score']), True
    #end if
    # items() exists on both Python 2 and 3 dicts (iteritems() is 2-only).
    for (key, value) in sorted(dict_to_print.items(), key=sort_key, reverse=descending):
        routine_print_tab(key, value, of, strand, seq_type, counts_only, indexes_only)
    #end for
#end def print_tab
def routine_print_tab(key, value, of, strand, seq_type, counts_only, indexes_only):
    """Write one hit to ``of`` as a single tab-separated, newline-ended row.

    Column order: fasta_ID, motif, motif_type, strand, motif_length, score,
    then counts (unless ``indexes_only``), then the comma-joined indexes
    (unless ``counts_only``).

    Args:
        key: the motif, written in the second column.
        value: record dict with keys 'seq_ID', 'seq_len', 'score', 'counts'
            and 'indexes' ('counts' / 'indexes' are only read when their
            column is written).
        of: writable file-like object.
        strand: strand label, written as-is.
        seq_type: 0 is written as 'm', 1 as 'p', anything else as 't'.
        counts_only: when true the indexes column is omitted.
        indexes_only: when true the counts column is omitted.

    Raises:
        KeyError: if ``value`` lacks a key needed for the row; nothing is
            written in that case.
    """
    #fasta_ID\tmotif\tmotif_type\tstrand\tmotif_length\tscore\tcounts\tindexes\n'
    if seq_type == 0: #0=mirror, 1=palindrome, 2=triplex
        motif_type = 'm'
    elif seq_type == 1:
        motif_type = 'p'
    else:
        motif_type = 't'
    #end if
    fields = [value['seq_ID'], key, motif_type, strand, value['seq_len'], value['score']]
    if not indexes_only:
        fields.append(value['counts'])
    #end if
    if not counts_only:
        fields.append(','.join(str(idx) for idx in value['indexes']))
    #end if
    # The row is assembled completely before the single write, so a record
    # with a missing key cannot leave a truncated line in the output.
    of.write('\t'.join('{0}'.format(field) for field in fields) + '\n')
#end def routine_print_tab
def _clean_fasta_id(header_line):
    """Turn a '>' header line into an identifier usable in a tab-separated column."""
    # Drop every character that is neither a word character nor whitespace
    # (this removes the leading '>'), then collapse whitespace runs to '_'.
    seq_ID_space = re.sub(r'[^\w\s]', '', header_line.rstrip())
    return re.sub(r'\s+', '_', seq_ID_space)
#end def _clean_fasta_id

def main(args):
    """Convert the input file into a tab-formatted output file.

    The input is read line by line and dispatched on its prefix:

    * '#'        option line; it is searched for -P/--palindrome,
                 -T/--triplex, -C/--complement, -c/--counts and -i/--indexes
                 to set the motif type, strand and available columns
                 (presumably the NeSSie command line echoed in its output).
    * '>'        start of a new sequence; the hits collected for the previous
                 sequence are printed and the collection is reset. The first
                 one also triggers the column header.
    * '$'        a hit: '|'-separated fields where field 1 is the length,
                 field 2 the motif and an optional field 3 the alignment.
    * '@counts'  count to add to the current hit.
    * '@indexes' '|'-separated positions to add to the current hit.

    Args:
        args: dict with keys 'inputfile', 'outputfile', 'score' and
            'orderbycounts'.
    """
    ## Variables
    flag_score = bool(args['score'])
    flag_counts = bool(args['orderbycounts'])
    counts_only = False
    indexes_only = False
    seq_type = 0 #0=mirror, 1=palindrome, 2=triplex
    strand = '+'
    first_seq = True
    # Ordering used to print: 0=indexes, 1=counts, 2=score. Derived from the
    # command-line switches up front so it is defined even when the input
    # contains no '#' line; a '#' line recomputes it below.
    flag = 2 if flag_score else (1 if flag_counts else 0)
    ## Init data structures
    dict_hits = {} # {seq: {seq_ID: 'i', seq_len: s, score: n, counts: c, indexes: [i, ...]}, ...}
    ## Opening output and input files; both are closed even if parsing fails
    with open(args['outputfile'], 'w') as of, open(args['inputfile']) as fi:
        for line in fi:
            if line.startswith('#'):
                command_line = line.rstrip().split()
                if ('-P' in command_line) or ('--palindrome' in command_line):
                    seq_type = 1
                elif ('-T' in command_line) or ('--triplex' in command_line):
                    seq_type = 2
                #end if
                if ('-C' in command_line) or ('--complement' in command_line):
                    strand = '-'
                #end if
                if ('-c' in command_line) or ('--counts' in command_line):
                    flag_counts = True # having only counts hits will be ordered by counts
                    counts_only = True
                elif ('-i' in command_line) or ('--indexes' in command_line):
                    if flag_counts:
                        # Parenthesised single-argument print behaves the same
                        # on Python 2 and Python 3.
                        print('\nTo order by counts provide output with counts, results will be ordered by indexes!\n')
                    #end if
                    indexes_only = True
                    flag_counts = False # having only indexes hits will be ordered by indexes
                #end if
                ## Decide ordering used to print
                flag = 2 if flag_score else (1 if flag_counts else 0)
            elif line.startswith('>'):
                if first_seq:
                    first_seq = False
                    ## Writing header
                    if counts_only:
                        of.write('#fasta_ID\tmotif\tmotif_type\tstrand\tmotif_length\tscore\tcounts\n')
                    elif indexes_only:
                        of.write('#fasta_ID\tmotif\tmotif_type\tstrand\tmotif_length\tscore\tindexes\n')
                    else:
                        of.write('#fasta_ID\tmotif\tmotif_type\tstrand\tmotif_length\tscore\tcounts\tindexes\n')
                    #end if
                else:
                    # Flush the hits of the sequence that just ended
                    print_tab(dict_hits, of, strand, seq_type, flag, counts_only, indexes_only)
                    dict_hits = {} # re-initializing dict_hits for new sequence
                #end if
                seq_ID = _clean_fasta_id(line)
            elif line.startswith('$'):
                fields = line.split('|')
                seq_len, seq = int(fields[1]), fields[2].rstrip()
                # The alignment is only present when the record has exactly four fields
                align = fields[3].rstrip() if len(fields) == 4 else None
                dict_hits.setdefault(seq, {'seq_ID': seq_ID, 'seq_len' : seq_len, 'score': routine_score(seq_len, align), 'counts': 0, 'indexes': []})
            elif line.startswith('@counts'):
                dict_hits[seq]['counts'] += int(line.rstrip().split()[1])
            elif line.startswith('@indexes'):
                # [:-1] drops the last '|'-separated element
                # (assumes the list ends with a trailing '|' - TODO confirm)
                dict_hits[seq]['indexes'] += [int(x) for x in line.rstrip().split()[1].split('|')[:-1]]
            #end if
        #end for
        # Hits of the last sequence in the file
        print_tab(dict_hits, of, strand, seq_type, flag, counts_only, indexes_only)
    #end with
#end def main
if __name__ == '__main__':
    # Command-line interface: two mandatory file paths plus two optional
    # ordering switches; the parsed namespace is handed to main() as a dict.
    parser = argparse.ArgumentParser(description='Script to format the output from NeSSIe')
    parser.add_argument(
        '-i', '--inputfile',
        required=True,
        help='output file from NeSSIe as input',
    )
    parser.add_argument(
        '-o', '--outputfile',
        required=True,
        help='file to store formatted output',
    )
    parser.add_argument(
        '-c', '--orderbycounts',
        action='store_true',
        required=False,
        help='order the results by counts and not by indexes',
    )
    parser.add_argument(
        '-s', '--score',
        action='store_true',
        required=False,
        help='order by score',
    )
    args = vars(parser.parse_args())
    main(args)
#end if