-
Notifications
You must be signed in to change notification settings - Fork 312
/
suffix_array.py
99 lines (91 loc) · 2.4 KB
/
suffix_array.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
"""
Calculates the suffix array and LCP array in O(n) time
Example:
>>>> S = 'cabbage'
>>>> SA = SAIS([ord(c) for c in S])
>>>> LCP = KASAI(S, SA)
>>>> SA
[1, 4, 3, 2, 0, 6, 5]
>>>> LCP
[1, 0, 1, 0, 0, 0]
"""
def SAIS(A):
"""
Calculates suffix array in O(len(A) + max(A))
Input:
Int list A with A[i] >= 0 for all i
"""
n = len(A)
buckets = [0] * (max(A) + 2)
for a in A:
buckets[a + 1] += 1
for b in range(1, len(buckets)):
buckets[b] += buckets[b - 1]
isL = [1] * n
for i in reversed(range(n - 1)):
isL[i] = +(A[i] > A[i + 1]) if A[i] != A[i + 1] else isL[i + 1]
def induced_sort(LMS):
SA = [-1] * (n)
SA.append(n)
endpoint = buckets[1:]
for j in reversed(LMS):
endpoint[A[j]] -= 1
SA[endpoint[A[j]]] = j
startpoint = buckets[:-1]
for i in range(-1, n):
j = SA[i] - 1
if j >= 0 and isL[j]:
SA[startpoint[A[j]]] = j
startpoint[A[j]] += 1
SA.pop()
endpoint = buckets[1:]
for i in reversed(range(n)):
j = SA[i] - 1
if j >= 0 and not isL[j]:
endpoint[A[j]] -= 1
SA[endpoint[A[j]]] = j
return SA
isLMS = [+(i and isL[i - 1] and not isL[i]) for i in range(n)]
isLMS.append(1)
LMS = [i for i in range(n) if isLMS[i]]
if len(LMS) > 1:
SA = induced_sort(LMS)
LMS2 = [i for i in SA if isLMS[i]]
prev = -1
j = 0
for i in LMS2:
i1 = prev
i2 = i
while prev >= 0 and A[i1] == A[i2]:
i1 += 1
i2 += 1
if isLMS[i1] or isLMS[i2]:
j -= isLMS[i1] and isLMS[i2]
break
j += 1
prev = i
SA[i] = j
LMS = [LMS[i] for i in SAIS([SA[i] for i in LMS])]
return induced_sort(LMS)
def KASAI(A, SA):
"""
Calculates LCP array in O(n) time
Input:
String A and its suffix array SA
"""
n = len(A)
rank = [0] * n
for i in range(n):
rank[SA[i]] = i
LCP = [0] * (n - 1)
k = 0
for i in range(n):
SAind = rank[i]
if SAind == n - 1:
continue
j = SA[SAind + 1]
while i + k < n and A[i + k] == A[j + k]:
k += 1
LCP[SAind] = k
k -= k > 0
return LCP