-
Notifications
You must be signed in to change notification settings - Fork 2
/
index_map.py
104 lines (75 loc) · 2.47 KB
/
index_map.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
from collections import defaultdict
class IndexMap:
    """Bidirectional mapping between words and unique integer ids.

    Supports lookups in both directions (wrd -> idx and idx -> wrd) and keeps
    a per-index counter of how many times each word was added via
    :meth:`add_wrd`.
    """

    def __init__(self, vocabs_file=None):
        """Build the map, either from a vocabulary file or from the defaults.

        :param vocabs_file: A string, path of a vocabulary file to load (one
            word per line, surrounding whitespace stripped). When omitted,
            the map is seeded with the special symbols only.
        """
        self.idx = 0  # next free index; also the number of unique words
        self.wrd_to_idx = {}
        self.idx_to_wrd = {}
        self.wrd_freq = defaultdict(int)  # index -> number of add_wrd calls

        if vocabs_file:
            # NOTE(review): the file is presumably expected to already
            # contain the special symbols; they are not added here.
            with open(vocabs_file, 'r') as vocabs:
                for line in vocabs:
                    self.add_wrd(line.strip())
        else:
            # Iterate a tuple, not a set: set order of strings depends on
            # per-process hash randomization, which would give the special
            # symbols different ids from one run to the next.
            for symbol in (self.get_start_wrd(),
                           self.get_end_wrd(),
                           self.get_unk_wrd()):
                self.add_wrd(symbol)

    @staticmethod
    def get_unk_wrd():
        """Return the symbol used for unknown words."""
        return '<unk>'

    @staticmethod
    def get_start_wrd():
        """Return the start-of-sequence symbol."""
        return '<s>'

    @staticmethod
    def get_end_wrd():
        """Return the end-of-sequence symbol."""
        return '</s>'

    def get_unk_id(self):
        """Return the index of the unknown-word symbol.

        :raises KeyError: if the symbol was never added to the map
        """
        return self.wrd_to_idx[self.get_unk_wrd()]

    def get_start_id(self):
        """Return the index of the start-of-sequence symbol.

        :raises KeyError: if the symbol was never added to the map
        """
        return self.wrd_to_idx[self.get_start_wrd()]

    def get_end_id(self):
        """Return the index of the end-of-sequence symbol.

        :raises KeyError: if the symbol was never added to the map
        """
        return self.wrd_to_idx[self.get_end_wrd()]

    def add_wrd(self, wrd):
        """Update index maps and increase word's frequency

        A word seen for the first time is assigned the next free index;
        every call (first or repeated) increments the word's frequency.

        :param wrd: A string, the input word
        """
        if wrd not in self.wrd_to_idx:
            self.wrd_to_idx[wrd] = self.idx
            self.idx_to_wrd[self.idx] = wrd
            self.idx += 1
        self.wrd_freq[self.wrd_to_idx[wrd]] += 1

    def get_wrd_by_idx(self, idx):
        """Return the word of the given index

        :param idx: An int, the index of the given word
        :return: A string, <unk> symbol if index does not exist else the
            word of the given index
        """
        return self.idx_to_wrd.get(idx, self.get_unk_wrd())

    def get_idx_by_wrd(self, wrd):
        """Return the index of the given word

        :param wrd: A string, the word to look up
        :return: An integer, the index of the <unk> symbol if the word does
            not exist else the index of the word
        :raises KeyError: if the word is missing and the <unk> symbol was
            never added to the map
        """
        found = self.wrd_to_idx.get(wrd)
        if found is None:
            return self.get_unk_id()
        return found

    def get_num_of_words(self):
        """Return the number of unique words

        :return: An integer, the number of unique words
        """
        return self.idx

    def get_wrd_freq(self, idx):
        """Return the frequency of a word given its index

        :param idx: An integer, an index of a word
        :return: An integer, the frequency of the given word (0 for an
            index that was never assigned)
        """
        # .get() rather than [] so that querying an unknown index does not
        # insert a spurious zero entry into the defaultdict.
        return self.wrd_freq.get(idx, 0)

    def get_wrd_freq_items(self):
        """Return word-freq pair list

        :return: A view of (index, frequency) pairs
        """
        return self.wrd_freq.items()