import codecs
import re
import numpy as np
from scipy import spatial
# The following imports are only needed by the commented-out code further below:
#import os
#from joblib import Parallel, delayed
#import multiprocessing
#from sklearn.metrics import r2_score
def load_embedding(path):
    '''Load a word-embedding text file into a dict mapping word -> list of floats.
    Lines that cannot be parsed are printed and skipped.'''
    embedding_dict = {}
    with codecs.open(path, "rb", "utf8", "ignore") as infile:
        for line in infile:
            try:
                parts = line.split()
                word = parts[0]
                nums = [float(p) for p in parts[1:]]
                embedding_dict[word] = nums
            except Exception:
                print(line)  # report the malformed line and move on
                continue
    return embedding_dict
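
# The loader above assumes the usual word2vec/fastText .vec text layout: one
# token per line followed by its whitespace-separated vector components, e.g.
#   كلمة 0.12 -0.34 0.56 ...
# Lines that break this layout (e.g. a token containing spaces, so a component
# fails to parse as float) are printed and skipped by the try/except.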
def clean_arabic_str(text):
    '''
    Clean an Arabic string: remove tashkeel (diacritics), collapse elongated
    (repeated) letters, and unify alif variants, taa marbuta with haa, and
    alif maqsura with yaa.
    :param text: an Arabic word
    :type text: str
    :return: the cleaned text
    '''
    search = ["أ", "إ", "آ", "ة", "_", "-", "/", ".", "،", " و ", " يا ", '"', "ـ", "'", "ى", "\\", '\n', '\t',
              '"', '?', '؟', '!']
    replace = ["ا", "ا", "ا", "ه", " ", " ", "", "", "", " و", " يا", "", "", "", "ي", "", ' ', ' ', ' ', ' ? ', ' ؟ ',
               ' ! ']
    # remove tashkeel (Arabic diacritic marks)
    p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(p_tashkeel, "", text)
    # collapse elongation: reduce any run of a repeated character to two
    p_longation = re.compile(r'(.)\1+')
    subst = r"\1\1"
    text = re.sub(p_longation, subst, text)
    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ي')
    text = text.replace('اا', 'ا')
    for i in range(0, len(search)):
        text = text.replace(search[i], replace[i])
    # trim surrounding whitespace
    text = text.strip()
    return text
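
# A sketch of the behavior above on two sample inputs:
#   clean_arabic_str("عَلَى")    -> "علي"    (tashkeel removed, ى unified to ي)
#   clean_arabic_str("مدرســـة") -> "مدرسه"  (tatweel dropped, ة unified to ه)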
def refactor():
    '''
    Load and clean the STS ar-ar track data.
    :return: sts_1: list of first sentences (each a list of cleaned tokens)
             sts_2: list of second sentences (same format)
             gold_standard: list of gold similarity scores
    '''
    gold_standard = []
    sts_1 = []
    sts_2 = []
    with open("data/STS.gs.track1.ar-ar.txt", "r") as sts_results:
        for line in sts_results:
            line = line.strip()
            gold_standard.append(float(line))
    with codecs.open('data/STS.input.track1.ar-ar.txt', 'r', "utf-8-sig") as infile:
        for line in infile:
            line = line.strip().split("\t")
            sts_1.append(line[0].split(" "))
            sts_2.append(line[1].split(" "))
    for i, sentence in enumerate(sts_1):
        for k, word in enumerate(sentence):
            word = clean_arabic_str(word).replace(" ", "_")  # clean the token to match the training format
            word = word.replace(".", "")
            sts_1[i][k] = word.strip()
    for i, sentence in enumerate(sts_2):
        for k, word in enumerate(sentence):
            word = clean_arabic_str(word).replace(" ", "_")  # clean the token to match the training format
            word = word.replace(".", "")
            sts_2[i][k] = word.strip()
    return sts_1, sts_2, gold_standard
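
# Expected data layout (inferred from the parsing above): the gold-standard
# file holds one similarity score per line, and the input file holds one
# tab-separated sentence pair per line, e.g.
#   STS.gs.track1.ar-ar.txt:    2.4
#   STS.input.track1.ar-ar.txt: <sentence 1>\t<sentence 2>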
def build_matrix(sts_1, sts_2, embedding_dict):  # TODO: delete empty lists generically (a possible fix is sketched below)
    '''
    :param sts_1: first list of tokenized sentences from the similarity evaluation task
    :param sts_2: second list of tokenized sentences, paired with sts_1
    :param embedding_dict: a dictionary of word vector representations (already loaded)
    :return: averaged sentence representations for both lists, where every sentence is
             first converted into the list of embeddings of the words found in the vocabulary.
    Sentence pair 66 is deleted because none of its words has a representation.
    '''
    sts_1_embeddings = []
    sts_2_embeddings = []
    sentence_embedding_1 = []
    sentence_embedding_2 = []
    for i, sentence in enumerate(sts_1):
        for k, word in enumerate(sentence):
            try:
                sentence_embedding_1.append(list(embedding_dict[word]))
            except KeyError:
                print("not found:" + word)
        sts_1_embeddings.append(sentence_embedding_1)
        sentence_embedding_1 = []
    for i, sentence in enumerate(sts_2):
        for k, word in enumerate(sentence):
            try:
                sentence_embedding_2.append(list(embedding_dict[word]))
            except KeyError:
                print("not found:" + word)
        sts_2_embeddings.append(sentence_embedding_2)
        sentence_embedding_2 = []
    # np.array/np.delete on these ragged lists would build an object array (and
    # errors outright on recent numpy), so drop the empty pair with plain list deletion.
    del sts_1_embeddings[66]
    del sts_2_embeddings[66]
    return averaged_sentence_representation(sts_1_embeddings, sts_2_embeddings)
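
# A possible answer to the TODO above (a hypothetical helper, not wired into the
# pipeline): drop every pair where either side has no in-vocabulary words
# instead of hard-coding index 66, keeping the gold standard aligned.
def drop_empty_pairs(sts_1_embeddings, sts_2_embeddings, gold_standard):
    kept = [(a, b, g) for a, b, g in
            zip(sts_1_embeddings, sts_2_embeddings, gold_standard)
            if len(a) > 0 and len(b) > 0]
    s1, s2, gs = zip(*kept)
    return list(s1), list(s2), list(gs)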
def averaged_sentence_representation(sts_1_embeddings, sts_2_embeddings):
    '''
    Creates one embedding per sentence by averaging its word embeddings.
    :return: sentence_1_rep: one vector per first sentence, the mean of the
             embeddings of the words that were found in the vocabulary
             sentence_2_rep: same as above for the second sentences
    '''
    sentence_1_rep = []
    sentence_2_rep = []
    acc = 0.0
    for sentencelist in sts_1_embeddings:  # iterate over the sentences
        for word_embd in sentencelist:  # every found word embedding in the sentence
            word_embd = np.array(word_embd)
            word_embd = np.divide(word_embd, len(sentencelist))  # normalize by the number of found words
            acc = np.add(acc, word_embd)  # aggregate with the previous words
        sentence_1_rep.append(acc)  # append the finished sentence representation
        acc = 0.0
    for sentencelist in sts_2_embeddings:
        for word_embd in sentencelist:
            word_embd = np.array(word_embd)
            word_embd = np.divide(word_embd, len(sentencelist))
            acc = np.add(acc, word_embd)
        sentence_2_rep.append(acc)
        acc = 0.0  # reset the accumulator for the next sentence
    sentence_1_rep = np.array(sentence_1_rep)
    sentence_2_rep = np.array(sentence_2_rep)
    return (sentence_1_rep, sentence_2_rep)
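
# Equivalent, more compact formulation (a sketch; assumes every sentence list
# is non-empty, i.e. pair 66 was the only pair with no in-vocabulary words):
#   sentence_rep = np.array([np.mean(np.array(sent), axis=0)
#                            for sent in sts_embeddings])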
def compute_cosine(sentence_representation_1, sentence_representation_2):
    ''':return: result: list of cosine similarities between the paired sentence representations'''
    result = []
    for rep_1, rep_2 in zip(sentence_representation_1, sentence_representation_2):
        result.append(1 - spatial.distance.cosine(rep_1, rep_2))
    return result
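
# For reference: scipy's spatial.distance.cosine returns the cosine *distance*
# 1 - dot(u, v) / (||u|| * ||v||), so subtracting it from 1 recovers the
# cosine similarity itself.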
def compute_correlation(result, gold_standard):
    # keep the gold scores aligned with the sentence pair deleted in build_matrix
    gold_standard = np.delete(gold_standard, 66)
    correlation = np.corrcoef(gold_standard, result)
    #np.savetxt("eval/" + output + "Eval", correlation, fmt='%.18e', header="")
    return correlation
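
# np.corrcoef returns the full 2x2 Pearson correlation matrix; the correlation
# between system scores and gold scores is the off-diagonal entry, e.g.:
#   pearson_r = compute_correlation(result, gold_standard)[0, 1]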
# Earlier, unrefactored version of the pipeline, kept commented out for reference:
#
# def semEval(embedding_path, output):
#     ''':param embedding_path
#     :return: correlation; also prints the results to a file
#     '''
#     embedding_dict = load_embedding(embedding_path)
#     result = []
#     gold_standard = []
#
#     with open("data/STS.gs.track1.ar-ar.txt", "r") as sts_results:
#         for line in sts_results:
#             line = line.strip()
#             gold_standard.append(float(line))
#
#     with codecs.open('data/STS.input.track1.ar-ar.txt', 'r', "utf-8-sig") as infile:  # load the track test set
#         for index, line in enumerate(infile):
#             line = line.strip().split("\t")  # split into the two sentences
#             #sts_1 = line[0].replace(".", "").split(" ")
#             #sts_2 = line[1].replace(".", "").split(" ")
#             sts_1 = line[0].split(" ")  # create array with words from sentence one
#             sts_2 = line[1].split(" ")  # create array with words from sentence two
#             sum_embedding_1 = 0.0
#             sum_embedding_2 = 0.0
#             length_found = 0
#
#             for m, token in enumerate(sts_1):
#                 try:
#                     embedding_dict[token]
#                     word_embedding = np.array(embedding_dict[token])
#                     word_embedding = np.divide(word_embedding, length_found)  # normalizing the word embedding by the number of found words
#                     sum_embedding_1 = np.add(sum_embedding_1, word_embedding)  # add the word vector to the sentence vector
#                 except KeyError as e:
#                     print("not found:" + token)
#             #sts_1_score = np.divide(sum_embedding_2, len(sts_2))  # not needed anymore
#             length_found = 0
#             for token in sts_2:
#                 if token in embedding_dict:
#                     length_found += 1
#             if length_found == 0:
#                 length_found = 1
#
#             for token in sts_2:
#                 try:
#                     embedding_dict[token]
#                     word_embedding = np.array(embedding_dict[token])
#                     word_embedding = np.divide(word_embedding, length_found)
#                     sum_embedding_2 = np.add(sum_embedding_2, word_embedding)  # same as above
#                 except KeyError as e:
#                     print("not found:" + token)
#             #sts_2_score = np.divide(sum_embedding_2, len(sts_2))  # not needed anymore
#             result.append(float(1 - spatial.distance.cosine(sum_embedding_1, sum_embedding_2)))
#     #result = [result.append(float(i)) for i in result]
#     index_missing_elements = np.argwhere(np.isnan(result))
#     print(index_missing_elements)
#     for a in index_missing_elements:
#         print(int(a))
#         del result[int(a)]  # one sentence pair produces a nan value, so it is deleted
#         del gold_standard[int(a)]  # delete the corresponding entry from the gold standard
#
#     result = np.array(result)
#     print(result)
#     gold_standard = np.array(gold_standard)
#     print(np.argwhere(np.isnan(result)))
#
#     #print(r2_score(gold_standard, result))
#     correlation = np.corrcoef(gold_standard, result)
#     print(correlation)
#
#     np.savetxt("eval/" + output + "Eval", correlation, fmt='%.18e', header="")
#     return correlation
#
#semEval("data/vec/" + "ara_news_2010_300K-sentencesCleaned.txt.vec", "55ara_news_2008_1M-sentencesCleaned.txt.vec")
#num_cores = multiprocessing.cpu_count()
#Parallel(n_jobs=num_cores)(delayed(semEval)("data/vec/" + file_name, file_name) for file_name in file_list)  # for the server
#file_list = os.listdir('data/vec')
def run(embedding_path):
    '''Convenience wrapper: load an embedding file that lives under data/vec/.'''
    path_prefix = "data/vec/"
    return load_embedding(path_prefix + embedding_path)
if __name__ == "__main__":
    #listofembedidngs = ["ara_news_2008_1M-sentencesCleaned.txt.vec"]
    embedding_path = "data/vec/years_concatinated.vec"
    embedding_dict = load_embedding(embedding_path)
    sts_1, sts_2, gold_standard = refactor()
    sts_1_embeddings, sts_2_embeddings = build_matrix(sts_1, sts_2, embedding_dict)
    result = compute_cosine(sts_1_embeddings, sts_2_embeddings)
    correlation = compute_correlation(result, gold_standard)
    print(correlation)