-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexe3_global_alignment.py
143 lines (121 loc) · 5.02 KB
/
exe3_global_alignment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from collections import defaultdict
from copy import deepcopy
import numpy as np
DIAG = 2
LEFT = 3
UP = 4
CONTAINS_DIAG = [DIAG, DIAG + LEFT, DIAG + UP, DIAG + LEFT + UP]
CONTAINS_LEFT = [LEFT, LEFT + DIAG, LEFT + UP, LEFT + DIAG + UP]
CONTAINS_UP = [UP, UP + DIAG, UP + LEFT, UP + DIAG + LEFT]
class GlobalAlignment:
def __init__(self, string1, string2, gap_penalty, matrix):
"""
:param string1: first string to be aligned, string
:param string2: second string to be aligned, string
:param gap_penalty: gap penalty, integer
:param matrix: substitution matrix containing scores for amino acid
matches and mismatches, dict
Attention! string1 is used to index columns, string2 is used to index rows
"""
self.string1 = string1
self.string2 = string2
self.gap_penalty = gap_penalty
self.subst_mat = matrix
self.score_mat = None
self.alignments = []
self.align()
def align(self):
"""
Align given strings using the Needleman-Wunsch algorithm,
store the alignments and the score matrix used to compute those alignments.
NB: score matrix and the substitution matrix are different matrices!
"""
x = self.string1
y = self.string2
nx = len(x)
ny = len(y)
# Optimal score at each possible pair of characters.
score_mat = np.zeros((nx + 1, ny + 1), dtype=int)
score_mat[:, 0] = np.linspace(0, nx, nx + 1) * self.gap_penalty
score_mat[0, :] = np.linspace(0, ny, ny + 1) * self.gap_penalty
# Pointers to trace through an optimal aligment.
point_mat = np.zeros((nx + 1, ny + 1), dtype=int)
point_mat[:, 0] = UP
point_mat[0, :] = LEFT
# Temporary scores.
t = np.zeros(3)
for i in range(nx):
for j in range(ny):
t[0] = score_mat[i, j] + self.subst_mat[x[i]][y[j]]
t[1] = score_mat[i, j + 1] + self.gap_penalty
t[2] = score_mat[i + 1, j] + self.gap_penalty
max_t = np.max(t)
score_mat[i + 1, j + 1] = max_t
if t[0] == max_t:
point_mat[i + 1, j + 1] += DIAG
if t[1] == max_t:
point_mat[i + 1, j + 1] += UP
if t[2] == max_t:
point_mat[i + 1, j + 1] += LEFT
def rec(i, j, rx, ry, idx, ridx):
if i <= 0 and j <= 0:
ridx.append(idx)
return rx, ry
idx1 = idx + (point_mat[i, j],)
rx.setdefault(idx, {''})
ry.setdefault(idx, {''})
rx.setdefault(idx1, set())
ry.setdefault(idx1, set())
rx1_ry1_i = []
if point_mat[i, j] in CONTAINS_DIAG:
rx_diag = deepcopy(rx)
ry_diag = deepcopy(ry)
rx_diag[idx1] |= {x[i - 1] + rx_idx_i for rx_idx_i in rx_diag[idx]}
ry_diag[idx1] |= {y[j - 1] + ry_idx_i for ry_idx_i in ry_diag[idx]}
rx1_ry1_i.append(rec(i - 1, j - 1, rx_diag, ry_diag, idx1, ridx))
if point_mat[i, j] in CONTAINS_LEFT:
rx_left = deepcopy(rx)
ry_left = deepcopy(ry)
rx_left[idx1] |= {'-' + rx_idx_i for rx_idx_i in rx_left[idx]}
ry_left[idx1] |= {y[j - 1] + ry_idx_i for ry_idx_i in ry_left[idx]}
rx1_ry1_i.append(rec(i, j - 1, rx_left, ry_left, idx1, ridx))
if point_mat[i, j] in CONTAINS_UP:
rx_up = deepcopy(rx)
ry_up = deepcopy(ry)
rx_up[idx1] |= {x[i - 1] + rx_idx_i for rx_idx_i in rx_up[idx]}
ry_up[idx1] |= {'-' + ry_idx_i for ry_idx_i in ry_up[idx]}
rx1_ry1_i.append(rec(i - 1, j, rx_up, ry_up, idx1, ridx))
rx1 = defaultdict(set)
ry1 = defaultdict(set)
for rx1_i, ry1_i in rx1_ry1_i:
for key, value in rx1_i.items():
rx1[key] |= value
for key, value in ry1_i.items():
ry1[key] |= value
return rx1, ry1
ridx = []
rx, ry = rec(i=nx, j=ny, rx={}, ry={}, idx=(), ridx=ridx)
for idx in ridx:
self.alignments.append((''.join(rx[idx]), ''.join(ry[idx])))
self.score_mat = score_mat
def get_best_score(self):
"""
:return: the highest score for the aligned strings, int
"""
return self.score_mat[-1, -1]
def get_number_of_alignments(self):
"""
:return: number of found alignments with the best score
"""
return len(self.alignments)
def get_alignments(self):
"""
:return: list of alignments, where each alignment is represented
as a tuple of aligned strings
"""
return self.alignments
def get_score_matrix(self):
"""
:return: matrix built during the alignment process as a list of lists
"""
return self.score_mat.T