-
Notifications
You must be signed in to change notification settings - Fork 2
/
pdst.py
84 lines (72 loc) · 1.9 KB
/
pdst.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
VALID_CHARS = {'A', 'T', 'C', 'G'}
class DNA():
def __init__(self, name='', info='', content=None):
self.name = name
self.info = info
self.content = content
def __repr__(self):
return "name: {}\ninfo: {}\ncontent: {}".format(self.name, self.info, self.content)
def add_new_DNA(dna_list, line):
assert line[0] == '>'
first_space_idx = line.find(' ')
if first_space_idx != -1:
dna_name = line[1:first_space_idx]
dna_info = line[first_space_idx:].strip()
else:
dna_name = line[1:]
dna_info = ''
dna_list.append(DNA(name=dna_name, info=dna_info, content=[]))
def add_line_to_DNA(cur_DNA, line):
for x in line:
if x in VALID_CHARS:
cur_DNA.content.append(x)
elif x == ' ':
continue
else:
raise Exception()
def parse_FASTA(file):
"""
Basic state machine for parsing.
0 = Expecting '>' or empty line.
1 = Expecting valid string for current DNA or empty line.
2 = Got at least one string for current DNA. Ready for new DNA
or continue current DNA.
"""
state = 0
dna_list = []
for line in file:
line = line.strip()
if state == 0:
if line[0] == '>':
add_new_DNA(dna_list, line)
state = 1
elif line == '':
continue
else:
raise Exception()
elif state == 1:
add_line_to_DNA(dna_list[-1], line)
state = 2
elif state == 2:
if line[0] == '>':
add_new_DNA(dna_list, line)
state = 1
else:
add_line_to_DNA(dna_list[-1], line)
else:
raise Exception()
file.seek(0)
return dna_list
# GO HERE
with open('rosalind_pdst.txt') as file:
dna_list = parse_FASTA(file)
dna_list = [x.content for x in dna_list]
n = len(dna_list)
s_len = len(dna_list[0])
assert all([len(x) == s_len for x in dna_list])
D = [[0.0 for j in range(n)] for i in range(n)]
for i in range(n-1):
for j in range(i+1, n):
D[i][j] = D[j][i] = sum([dna_list[i][k] != dna_list[j][k] for k in range(s_len)]) / s_len
for row in D:
print(' '.join(['{:.5f}'.format(x) for x in row]))