-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstring_matching.py
276 lines (249 loc) · 8.77 KB
/
string_matching.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
"""
@version: python3.6
@author: Fieldhunter
@contact: 1677532160yuan@gmail.com
@time: 2020-05-03
"""
def BF(main, pattern):
main_length = len(main) - 1
pattern_length = len(pattern) - 1
if main_length < pattern_length:
main, pattern = pattern, main
main_length, pattern_length = pattern_length, main_length
index = 0
find = False
while index <= main_length - pattern_length and find == False:
main_pointer = index
pattern_pointer = 0
while pattern_pointer <= pattern_length:
if main[main_pointer] == pattern[pattern_pointer]:
main_pointer += 1
pattern_pointer += 1
else:
break
if pattern_pointer > pattern_length:
find = True
else:
index += 1
if find:
print("Find target string")
else:
print("No target string")
def RK(main, pattern):
def hash_function(string):
sumer = 0
for i in string:
sumer += ord(i)
return sumer
main_length = len(main) - 1
pattern_length = len(pattern) - 1
if main_length < pattern_length:
main,pattern = pattern,main
main_length, pattern_length = pattern_length, main_length
pattern_hash = hash_function(pattern)
index = 0
find = False
while index <= main_length - pattern_length and find == False:
main_hash = hash_function(main[index : index+pattern_length+1])
if main_hash == pattern_hash:
main_pointer = index
pattern_pointer = 0
while pattern_pointer <= pattern_length:
if main[main_pointer] == pattern[pattern_pointer]:
main_pointer += 1
pattern_pointer += 1
else:
break
if pattern_pointer > pattern_length:
find = True
else:
index += 1
else:
index += 1
if find:
print("Find target string")
else:
print("No target string")
def BM(main, pattern):
# Find the last element less than the target.
def binary_search(data_list, target):
num = len(data_list)
low = 0
high = num - 1
find = False
while low <= high:
middle_num = low + (high-low) // 2
if data_list[middle_num] >= target:
high = middle_num - 1
elif middle_num == num - 1 or data_list[middle_num+1] >= target:
find = True
break
else:
low = middle_num + 1
if find:
return data_list[middle_num]
else:
return -1
def bad_character_rule(
character_pos,
main,
pattern,
main_pointer,
pattern_pointer):
bad_character = main[main_pointer]
si = pattern_pointer
xi = -1
if character_pos.get(ord(bad_character), False):
pos_list = character_pos[ord(bad_character)]
xi = binary_search(pos_list, pattern_pointer)
bad_character_move = si - xi
return bad_character_move
def good_suffix_rule(
main,
pattern,
main_pointer,
pattern_pointer,
pattern_length,
suffix,
prefix):
good_suffix = pattern[pattern_pointer+1 : ]
len_good_suffix = len(good_suffix)
if len_good_suffix == 0:
return 0
if suffix[len_good_suffix] != -1:
return pattern_pointer - suffix[len_good_suffix] + 1
else:
k = pattern_pointer - 1
while k >= 0:
if prefix[k] == True:
return pattern_length - k + 1
k -= 1
return pattern_length + 1
main_length = len(main) - 1
pattern_length = len(pattern) - 1
if main_length < pattern_length:
main, pattern = pattern, main
main_length, pattern_length = pattern_length, main_length
"""
The location of each character in the pattern string is stored to
greatly improve the search efficiency of bad characters.
"""
character_pos = {}
index = 0
for i in pattern:
character_ascll = ord(i)
if not character_pos.get(character_ascll, False):
character_pos[character_ascll] = [index]
else:
character_pos[character_ascll].append(index)
index += 1
"""
Suffix array is used to store the starting subscript value of
the same substring in the pattern string as the suffix substring
of the length (i.e. subscript).
Prefix array is used to record whether the suffix substring of
the length (i.e. subscript) pattern string can match the prefix substring
of the pattern string.
Both the suffix and prefix arrays start at the position marked 1,
and the position marked 0 is not counted. The subscript means the number
of characters in the substring.
"""
suffix = [-1] * (pattern_length+1)
prefix = [False] * (pattern_length+1)
for i in range(pattern_length):
j = i
k = 0
while j >= 0 and pattern[pattern_length-k] == pattern[j]:
k += 1
suffix[k] = j
j -= 1
if j < 0:
prefix[k] = True
index = 0
find = False
while index <= main_length - pattern_length and find == False:
main_pointer = index + pattern_length
pattern_pointer = pattern_length
while pattern_pointer >= 0:
if main[main_pointer] == pattern[pattern_pointer]:
main_pointer -= 1
pattern_pointer -= 1
else:
"""
get the number of moving steps from the bad character principle
and the good suffix principle at the same time.
"""
bad_character_move = bad_character_rule(character_pos,
main,
pattern,
main_pointer,
pattern_pointer)
good_suffix_move = good_suffix_rule(main,
pattern,
main_pointer,
pattern_pointer,
pattern_length,
suffix,
prefix)
break
if pattern_pointer < 0:
find = True
break
else:
"""
Choose multiple moving steps among bad character principle
and good suffix principle.
"""
if bad_character_move > good_suffix_move:
index += bad_character_move
else:
index += good_suffix_move
if find:
print("Find target string")
else:
print("No target string")
def KMP(main, pattern):
main_length = len(main) - 1
pattern_length = len(pattern) - 1
if main_length < pattern_length:
main, pattern = pattern, main
main_length, pattern_length = pattern_length, main_length
"""
The next_list array is used to store the subscript of the longest
matching prefix substring end character of all length prefix substrings
in the pattern string.
The array subscript represents the length of the prefix substring.
"""
next_list = [-1] * pattern_length
k = -1
for i in range(1, pattern_length):
while k != -1 and pattern[k+1] != pattern[i]:
k = next_list[k]
if pattern[k+1] == pattern[i]:
k += 1
next_list[i] = k
index = 0
find = False
while index <= main_length - pattern_length and find == False:
main_pointer = index
pattern_pointer = 0
while pattern_pointer <= pattern_length:
if main[main_pointer] == pattern[pattern_pointer]:
main_pointer += 1
pattern_pointer += 1
else:
# good prefix rule
good_prefix = pattern[ : pattern_pointer]
len_good_prefix = len(good_prefix) - 1
if len_good_prefix == -1:
index += 1
else:
index += pattern_pointer - next_list[len_good_prefix] - 1
break
if pattern_pointer > pattern_length:
find = True
break
if find:
print("Find target string")
else:
print("No target string")