-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathword_segment.py
96 lines (77 loc) · 3.39 KB
/
word_segment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
"""
This code is updated version of this: https://gist.github.com/markdtw/e2a4e2ee7cef8ea6aed33bb47a97fba6
Ye Kyaw Thu, LST, NECTEC, Thailand updated the following:
-- added recursion limit
-- changed P_unigram and P_bigram as module level global variable
-- using binary ngram dictionary
-- set N value of this: "def __init__(self, datafile=None, unigram=True, N=102490):"
-- Last Updated: 5 Sept 2021
# References:
- Python implementation of Viterbi algorithm for word segmentation:
- Updated version of this: https://gist.github.com/markdtw/e2a4e2ee7cef8ea6aed33bb47a97fba6
- A clean-up of this: http://norvig.com/ngrams/ch14.pdf
- For recursion limit: https://www.geeksforgeeks.org/python-handling-recursion-limit/
- A. Viterbi, "Error bounds for convolutional codes and an asymptotically optimum decoding algorithm," in IEEE Transactions on Information Theory, vol. 13, no. 2, pp. 260-269, April 1967, doi: 10.1109/TIT.1967.1054010.
"""
import math
import functools
import sys
import pickle
sys.setrecursionlimit(10**6)  # viterbi() recurses once per remaining character; raise the default limit
# NOTE(review): `global` at module top level is a no-op; P_unigram and P_bigram
# are presumably assigned (as ProbDist instances) by the script that imports
# this module -- confirm against the caller.
global P_unigram
global P_bigram
def read_dict (fileDICT):
    """Load a pickled n-gram frequency dictionary from fileDICT.

    Returns the unpickled dict, or an empty dict when the file does not
    exist (a warning is printed). Previously a missing file caused an
    UnboundLocalError because `dictionary` was never bound before the
    `return` statement.
    """
    dictionary = {}  # fallback so the function never returns an unbound name
    try:
        with open(fileDICT, 'rb') as input_file:
            # SECURITY NOTE: pickle.load executes arbitrary code from the
            # file -- only load dictionaries from trusted sources.
            dictionary = pickle.load(input_file)
            # no explicit close() needed: the `with` block closes the file
    except FileNotFoundError:
        print('Dictionary file', fileDICT, ' not found!')
    return dictionary
class ProbDist(dict):
    """Probability distribution estimated from unigram/bigram count data.

    The instance maps token -> raw count. Calling the instance returns the
    relative frequency count/N for known tokens and a smoothed estimate for
    unseen ones.
    """
    def __init__(self, datafile=None, unigram=True, N=102490):
        # Merge the pickled raw counts straight into this dict.
        self.update(read_dict(datafile))
        # Unigram mode penalizes long unknown words exponentially by length,
        # which discourages segmenting garbage into one huge "word".
        self.unknownprob = (
            (lambda key, total: 10 / (total * 10 ** len(key)))
            if unigram
            else (lambda key, total: 1 / total)
        )
        self.N = N
    def __call__(self, key):
        # Relative frequency for known keys, smoothed estimate otherwise.
        if key in self:
            return self[key] / self.N
        return self.unknownprob(key, self.N)
def conditionalProb(word_curr, word_prev):
    """Conditional probability P(word_curr | word_prev).

    Uses the bigram count divided by the previous word's unigram count when
    both are present; otherwise backs off to the (smoothed) unigram
    probability of the current word.
    """
    bigram_key = word_prev + ' ' + word_curr
    if bigram_key in P_bigram and word_prev in P_unigram:
        return P_bigram[bigram_key] / P_unigram[word_prev]
    # Back-off: either the bigram or the previous unigram is unseen.
    return P_unigram(word_curr)
@functools.lru_cache(maxsize=2**10)
def viterbi(text, prev='<S>', maxlen=20):
    """Segment `text` into the most probable word sequence.

    Returns a (log10-probability, word_list) pair. `prev` is the word
    preceding `text` (the sentence-start marker by default) and `maxlen`
    caps the length of any single candidate word.

    NOTE(review): lru_cache memoizes the returned list; callers must not
    mutate it.
    """
    if not text:
        return 0.0, []
    # Try every prefix of up to `maxlen` characters as the next word.
    limit = min(len(text), maxlen)
    best_candidates = []
    for cut in range(1, limit + 1):
        word, rest = text[:cut], text[cut:]
        word_logp = math.log10(conditionalProb(word, prev))
        rest_logp, rest_words = viterbi(rest, word)
        best_candidates.append((word_logp + rest_logp, [word] + rest_words))
    # Tuple comparison: highest log-probability wins (ties broken by the
    # word lists themselves, matching the original behavior).
    return max(best_candidates)