forked from Serenitas/topic-modeller
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathngram_adapter.py
193 lines (175 loc) · 7.04 KB
/
ngram_adapter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
import adjective_stemmer as stemmer
# Adjective endings grouped by gender and by stem type (hard / soft).
# The declension helpers in this module read position 0 when building a
# nominative form and position 1 when building a genitive form; the two
# masculine lists are additionally used to recognise the ending of the
# incoming adjective.
adj_endings_male_hard = [
    'ый',   # index 0
    'ого',  # index 1
    'ому',
    'ым',
    'ом',
    'ой',
]
adj_endings_male_soft = [
    'ий',   # index 0
    'его',  # index 1
    'ему',
    'им',
    'ем',
]
adj_endings_female_hard = [
    'ая',   # index 0
    'ой',   # index 1
    'ую',
]
adj_endings_female_soft = [
    'яя',   # index 0
    'ей',   # index 1
    'юю',
]
adj_endings_neuter_hard = [
    'ое',   # index 0
    'ого',  # index 1
    'ому',
    'ым',
]
adj_endings_neuter_soft = [
    'ее',   # index 0
    'его',  # index 1
    'ему',
    'им',
    'ем',
]
def adj_to_gender(adjective, gender):
    """Return *adjective* re-inflected to agree with *gender* (nominative).

    The adjective is split into stem and ending with ``stemmer.stem``; a
    recognised masculine ending is then replaced by the feminine or neuter
    nominative ending that fits the stem type.

    Args:
        adjective: Adjective, expected in its masculine dictionary form.
        gender: ``'м'`` for masculine, ``'ж'`` for feminine; any other value
            is treated as neuter.

    Returns:
        The re-inflected adjective. The input is returned unchanged when
        *gender* is masculine or when the ending is not one of the known
        masculine endings.
    """
    if gender == 'м':
        # Nothing to re-inflect, so the stemmer is not needed at all.
        return adjective

    stem = stemmer.stem(adjective)
    ending = adjective[len(stem):]
    feminine = gender == 'ж'

    if ending in adj_endings_male_hard:
        if feminine:
            return stem + adj_endings_female_hard[0]
        return stem + adj_endings_neuter_hard[0]

    if ending in adj_endings_male_soft:
        last = stem[-1:]
        if last in ('г', 'к', 'х', 'ц'):
            # Spelled with a soft-looking ending in the masculine, but the
            # other genders take hard endings: 'русский' -> 'русская', 'русское'.
            if feminine:
                return stem + adj_endings_female_hard[0]
            return stem + adj_endings_neuter_hard[0]
        if last in ('ж', 'ч', 'ш', 'щ'):
            # After hushing consonants 'я' is never written: the feminine is
            # '-ая' while the neuter keeps '-ее' ('общий' -> 'общая', 'общее').
            if feminine:
                return stem + adj_endings_female_hard[0]
            return stem + adj_endings_neuter_soft[0]
        if feminine:
            return stem + adj_endings_female_soft[0]
        return stem + adj_endings_neuter_soft[0]

    return adjective
def noun_to_genitive(noun, gender):
    """Return a rule-based genitive singular form of *noun*.

    The rule is chosen from the last letter of the noun and, for nouns
    ending in 'ь', from *gender*. This is a heuristic: irregular nouns and
    fleeting vowels are not handled.

    Args:
        noun: Noun in its dictionary (nominative singular) form.
        gender: ``'м'``, ``'ж'`` or ``'с'``; only consulted for nouns
            ending in 'ь'.

    Returns:
        The genitive form, or the empty string for an empty input.
    """
    if not noun:
        # Without this guard an empty input would turn into a bare 'а'.
        return noun

    stem = noun[:-1]
    ending = noun[-1:]

    if ending == 'ь':
        if gender == 'м':
            return stem + 'я'
        if gender == 'ж':
            return stem + 'и'
        # Any other gender falls through to the default rule at the bottom.
    if ending == 'а':
        # After velars and hushing consonants 'ы' is never written
        # ('книга' -> 'книги', 'кожа' -> 'кожи').
        if stem[-1:] in ('г', 'к', 'х', 'ж', 'ч', 'ш', 'щ'):
            return stem + 'и'
        return stem + 'ы'
    if ending == 'я':
        if stem[-1:] == 'м':
            # Nouns in '-мя': 'время' -> 'времени'.
            return stem + 'ени'
        return stem + 'и'
    if ending in ('е', 'й'):
        return stem + 'я'
    if ending == 'о':
        return stem + 'а'
    # Default: treat the word as ending in a hard consonant.
    return noun + 'а'
def adj_to_genitive(adjective, gender):
    """Return the genitive form of *adjective* for the given *gender*.

    The stem comes from ``stemmer.stem``; whatever follows it is treated as
    the ending. Only adjectives carrying one of the known masculine endings
    are changed, every other input is returned as it came in.

    Args:
        adjective: Adjective to inflect.
        gender: ``'м'`` or ``'ж'``; any other value selects the neuter ending.

    Returns:
        Stem plus the genitive ending (index 1 of the matching endings
        list), or the unchanged adjective.
    """
    base = stemmer.stem(adjective)
    suffix = adjective[len(base):]

    if suffix in adj_endings_male_hard:
        takes_hard = True
    elif suffix in adj_endings_male_soft:
        # Stems in г/к/х/ц take hard endings despite the soft-looking suffix.
        takes_hard = base[-1:] in ['г', 'к', 'х', 'ц']
    else:
        return adjective

    if takes_hard:
        if gender == 'м':
            return base + adj_endings_male_hard[1]
        if gender == 'ж':
            return base + adj_endings_female_hard[1]
        return base + adj_endings_neuter_hard[1]

    if gender == 'м':
        return base + adj_endings_male_soft[1]
    if gender == 'ж':
        return base + adj_endings_female_soft[1]
    return base + adj_endings_neuter_soft[1]
def build_noun_dictionary(filename):
    """Build a lemma -> gender mapping for the nouns listed in *filename*.

    Each line is split on '='; the first field is taken as the lemma and the
    second as its grammatical info. Lines with fewer than two fields are
    skipped. An entry is kept when its info contains 'S' but not 'SPRO'
    (presumably mystem-style noun / pronoun tags -- confirm against the
    dictionary file format).

    Args:
        filename: Path to the UTF-8 encoded dictionary file.

    Returns:
        Dict mapping lemma to ``'м'``, ``'ж'`` or ``'с'``. Entries without a
        gender marker are omitted.
    """
    genders = {}
    # The context manager closes the file even if parsing raises.
    with open(file=filename, encoding='utf-8') as source:
        for line in source:
            fields = line.strip('\n').split('=')
            if len(fields) < 2:
                continue
            lemma, info = fields[0], fields[1]
            if 'S' not in info or 'SPRO' in info:
                continue
            # Checked in this order so that, when several markers are
            # present, the later one wins.
            for marker, code in (('муж', 'м'), ('жен', 'ж'), ('сред', 'с')):
                if marker in info:
                    genders[lemma] = code
    return genders
def build_adj_dictionary(filename):
    """Build a lemma -> gender mapping for the adjectives listed in *filename*.

    Each line is split on '='; the first field is taken as the lemma and the
    second as its grammatical info. Lines with fewer than two fields are
    skipped. An entry is kept when its info contains 'A' but not 'APRO'
    (presumably mystem-style adjective / pronominal-adjective tags --
    confirm against the dictionary file format).

    Args:
        filename: Path to the UTF-8 encoded dictionary file.

    Returns:
        Dict mapping lemma to ``'м'``, ``'ж'`` or ``'с'``. Entries without a
        gender marker are omitted.
    """
    genders = {}
    # The context manager closes the file even if parsing raises.
    with open(file=filename, encoding='utf-8') as source:
        for line in source:
            fields = line.strip('\n').split('=')
            if len(fields) < 2:
                continue
            lemma, info = fields[0], fields[1]
            if 'A' not in info or 'APRO' in info:
                continue
            # Checked in this order so that, when several markers are
            # present, the later one wins.
            for marker, code in (('муж', 'м'), ('жен', 'ж'), ('сред', 'с')):
                if marker in info:
                    genders[lemma] = code
    return genders
def adapt_ngram(ngram, dictionary, adj_dictionary):
    """Inflect the words of a two- or three-word n-gram so they agree.

    Anything after the first '|' in *ngram* is discarded before processing.

    Recognised patterns:
      * noun + noun: the second noun is put into the genitive;
      * adjective + noun: the adjective is matched to the noun's gender;
      * noun + noun + noun: the second and third nouns are put into the
        genitive;
      * noun + adjective + noun: the adjective and the last noun are put
        into the genitive.

    The patterns for a given length are mutually exclusive, with the
    all-noun pattern taking priority, so a word listed in both dictionaries
    is never inflected twice.

    Args:
        ngram: Space-separated n-gram, optionally followed by '|' and extra data.
        dictionary: Mapping of noun lemma to gender code.
        adj_dictionary: Mapping of adjective lemma; only membership is checked.

    Returns:
        The adapted n-gram as a string. Single words and n-grams matching no
        pattern are returned without inflection. N-grams of four or more
        words are not supported and yield ``None``.
    """
    phrase = ngram.split('|')[0]
    words = phrase.split(' ')

    if len(words) == 1:
        return phrase

    if len(words) == 2:
        word1, word2 = words
        if word1 in dictionary and word2 in dictionary:
            word2 = noun_to_genitive(word2, dictionary[word2])
        elif word1 in adj_dictionary and word2 in dictionary:
            word1 = adj_to_gender(word1, dictionary[word2])
        return word1 + ' ' + word2

    if len(words) == 3:
        word1, word2, word3 = words
        if word1 in dictionary and word2 in dictionary and word3 in dictionary:
            word2 = noun_to_genitive(word2, dictionary[word2])
            word3 = noun_to_genitive(word3, dictionary[word3])
        elif word1 in dictionary and word2 in adj_dictionary and word3 in dictionary:
            # The adjective agrees with the noun that follows it.
            word2 = adj_to_genitive(word2, dictionary[word3])
            word3 = noun_to_genitive(word3, dictionary[word3])
        return word1 + ' ' + word2 + ' ' + word3

    # Longer n-grams are not handled.
    return None
def _write_lines(path, lines):
    """Write each item of *lines* to *path* (UTF-8), one per line."""
    with open(file=path, mode='w', encoding='utf-8') as out:
        out.writelines(line + '\n' for line in lines)


def adapt_ngrams(ngrams_file, dict_file, result_file,
                 adapted_file='adapted.txt',
                 multiword_file='multiword_only.txt'):
    """Adapt every n-gram in *ngrams_file* and write the results to disk.

    Input lines shorter than three characters are skipped, and anything
    after the first '|' on a line is discarded. One-, two- and three-word
    n-grams are processed; longer ones are not written to any output.

    Three files are produced:
      * *result_file*: every processed n-gram, inflected where a pattern
        matched;
      * *adapted_file*: only the multi-word n-grams that matched a pattern;
      * *multiword_file*: the two- and three-word n-grams as read, before
        inflection.

    For trigrams the noun+noun+noun and noun+adjective+noun patterns are
    mutually exclusive (the all-noun pattern wins), so a trigram is inflected
    and recorded in *adapted_file* at most once.

    Args:
        ngrams_file: Path to the UTF-8 file with one n-gram per line.
        dict_file: Path to the dictionary file used to build the noun and
            adjective gender dictionaries.
        result_file: Output path for the full adapted list.
        adapted_file: Output path for the matched multi-word n-grams.
        multiword_file: Output path for the untouched multi-word n-grams.
    """
    dictionary = build_noun_dictionary(dict_file)
    adj_dictionary = build_adj_dictionary(dict_file)

    with open(file=ngrams_file, encoding='utf-8') as source:
        ngrams = source.read().split('\n')

    adapted_ngrams = []
    multiword_only = []
    multiword_adapted = []
    for ngram in ngrams:
        if len(ngram) < 3:
            continue
        ngram = ngram.split('|')[0]
        words = ngram.split(' ')

        if len(words) == 1:
            adapted_ngrams.append(ngram)

        elif len(words) == 2:
            multiword_only.append(ngram)
            word1, word2 = words
            matched = True
            if word1 in dictionary and word2 in dictionary:
                word2 = noun_to_genitive(word2, dictionary[word2])
            elif word1 in adj_dictionary and word2 in dictionary:
                word1 = adj_to_gender(word1, dictionary[word2])
            else:
                matched = False
            adapted = word1 + ' ' + word2
            if matched:
                multiword_adapted.append(adapted)
            adapted_ngrams.append(adapted)

        elif len(words) == 3:
            multiword_only.append(ngram)
            word1, word2, word3 = words
            matched = True
            if word1 in dictionary and word2 in dictionary and word3 in dictionary:
                word2 = noun_to_genitive(word2, dictionary[word2])
                word3 = noun_to_genitive(word3, dictionary[word3])
            elif word1 in dictionary and word2 in adj_dictionary and word3 in dictionary:
                # The adjective agrees with the noun that follows it.
                word2 = adj_to_genitive(word2, dictionary[word3])
                word3 = noun_to_genitive(word3, dictionary[word3])
            else:
                matched = False
            adapted = word1 + ' ' + word2 + ' ' + word3
            if matched:
                multiword_adapted.append(adapted)
            adapted_ngrams.append(adapted)

    # Each file is opened, written and closed in turn so no handle is leaked.
    _write_lines(result_file, adapted_ngrams)
    _write_lines(adapted_file, multiword_adapted)
    _write_lines(multiword_file, multiword_only)