# seEval3.py (forked from anlausch/XWEAT)
import codecs
import re
import os
import multiprocessing

import numpy as np
from scipy import spatial
from joblib import Parallel, delayed

# Embedding files to evaluate, one .vec file per corpus.
file_list = os.listdir('data/vec')
# embedding_path = "data/vec/ara_news_2008_1M-sentencesCleaned.txt.vec"
def load_embedding(path):
    """Load a word-embedding file into a dict mapping word -> vector."""
    embedding_dict = {}
    with codecs.open(path, "rb", "utf8", "ignore") as infile:
        for line in infile:
            try:
                parts = line.split()
                word = parts[0]
                nums = [float(p) for p in parts[1:]]
                embedding_dict[word] = nums
            except Exception:
                # Skip malformed lines (e.g. header lines or broken tokens).
                print(line)
                continue
    return embedding_dict
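# Illustrative note (an assumption, not stated in the script): load_embedding
# expects the plain-text word2vec/fastText format, one token per line followed
# by its vector components, e.g.
#
#   كتاب 0.12 -0.05 0.33 ...
#
# so that, say, load_embedding("data/vec/example.vec")["كتاب"] returns a list
# of floats ("example.vec" is a hypothetical file name).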
def clean_arabic_str(text):
    '''
    Clean an Arabic string: remove tashkeel (diacritics), collapse repeated
    letters, and normalize alef variants and ta' marbuta to ha'.
    :param text: an Arabic word
    :type text: str
    :return: the cleaned text
    '''
    search = ["أ", "إ", "آ", "ة", "_", "-", "/", ".", "،", " و ", " يا ", '"', "ـ", "'", "ى", "\\", '\n', '\t',
              '"', '?', '؟', '!']
    replace = ["ا", "ا", "ا", "ه", " ", " ", "", "", "", " و", " يا", "", "", "", "ي", "", ' ', ' ', ' ', ' ? ', ' ؟ ',
               ' ! ']
    # remove tashkeel (Arabic diacritic marks)
    p_tashkeel = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
    text = re.sub(p_tashkeel, "", text)
    # collapse elongation: reduce any run of a repeated character to two
    p_longation = re.compile(r'(.)\1+')
    subst = r"\1\1"
    text = re.sub(p_longation, subst, text)
    text = text.replace('وو', 'و')
    text = text.replace('يي', 'ي')
    text = text.replace('اا', 'ا')
    for i in range(0, len(search)):
        text = text.replace(search[i], replace[i])
    # trim surrounding whitespace
    text = text.strip()
    return text
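# Illustrative example of the cleaning (assumed, not from the original file):
#
#   clean_arabic_str("مدرســـة")  ->  "مدرسه"
#
# The tatweel (elongation) characters are stripped and the final ta' marbuta
# is normalized to ha'.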
def create_gold_standard():
    """Read the gold-standard similarity scores, one float per line."""
    gold_standard = []
    with open("data/STS.gs.track1.ar-ar.txt", "r") as sts_results:
        for line in sts_results:
            line = line.strip()
            gold_standard.append(float(line))
    return np.array(gold_standard)
def calculate_correlation(result, output):
    """Compare the system scores against the gold standard with Pearson correlation."""
    gold_standard = create_gold_standard()
    # Drop sentence pairs for which no score could be computed (NaN entries).
    index_missing_elements = np.argwhere(np.isnan(result))
    gold_standard = np.delete(gold_standard, index_missing_elements)
    result = np.delete(result, index_missing_elements)
    # np.corrcoef returns the 2x2 correlation matrix; the off-diagonal entries
    # hold the Pearson r between the gold standard and the system output.
    correlation = np.corrcoef(gold_standard, result)
    np.savetxt("eval/" + output + "Eval", correlation, fmt='%.18e', header="")
    return correlation
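# Minimal usage sketch (hypothetical file name): since np.corrcoef writes a
# 2x2 matrix, the single Pearson r can be read back from the off-diagonal:
#
#   corr = np.loadtxt("eval/example.vecEval")
#   pearson_r = corr[0, 1]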
def semEval(embedding_path, output):
    '''
    Score every SemEval STS ar-ar sentence pair with the given embeddings.
    :param embedding_path: path to the embedding file
    :param output: name used for the evaluation output file
    :return: correlation with the gold standard; results are also written to a file
    '''
    embedding_dict = load_embedding(embedding_path)
    result = []
    with codecs.open('data/STS.input.track1.ar-ar.txt', 'r', "utf-8") as infile:  # load track test data
        for index, line in enumerate(infile):
            line = line.strip().split("\t")  # split the pair into its two sentences
            sts_1 = line[0].split(" ")  # words of sentence one
            sts_2 = line[1].split(" ")  # words of sentence two
            print(index, "\n", sts_1, "\n", sts_2)
            sum_embedding_1 = 0.0
            sum_embedding_2 = 0.0
            number_of_found_words = 0
            for token in sts_1:
                token = clean_arabic_str(token).replace(" ", "_")  # clean the token to match the training format
                token = token.replace(".", "")
                try:
                    word_embedding = np.array(embedding_dict[token])
                    sum_embedding_1 = np.add(sum_embedding_1, word_embedding)  # add the word vector to the sentence vector
                    number_of_found_words += 1
                except KeyError:
                    print("not found: " + token)
            if number_of_found_words > 0:
                sentence_1_representation = np.divide(sum_embedding_1, number_of_found_words)  # average the word vectors
            else:
                sentence_1_representation = np.divide(sum_embedding_1, 1)
            number_of_found_words = 0
            for token in sts_2:
                token = clean_arabic_str(token).replace(" ", "_")  # clean the token to match the training format
                token = token.replace(".", "")
                try:
                    word_embedding = np.array(embedding_dict[token])
                    sum_embedding_2 = np.add(sum_embedding_2, word_embedding)
                    number_of_found_words += 1
                except KeyError:
                    print("not found: " + token)
            if number_of_found_words > 0:
                sentence_2_representation = np.divide(sum_embedding_2, number_of_found_words)  # average the word vectors
            else:
                sentence_2_representation = np.divide(sum_embedding_2, 1)
            if np.ndim(sentence_1_representation) == 0 or np.ndim(sentence_2_representation) == 0:
                # No embedding was found for any token of one sentence, so the
                # similarity is undefined; calculate_correlation drops NaN entries.
                result.append(float('nan'))
            else:
                result.append(float(1 - spatial.distance.cosine(sentence_1_representation, sentence_2_representation)))
    result = np.array(result)
    return calculate_correlation(result, output)
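# Minimal sketch of the scoring step (illustrative numbers, not real data):
# a sentence is represented as the average of its word vectors, and the pair
# score is the cosine similarity of the two averages, e.g.
#
#   v1 = np.mean([[1.0, 0.0], [0.0, 1.0]], axis=0)  # -> [0.5, 0.5]
#   v2 = np.array([1.0, 1.0])
#   score = 1 - spatial.distance.cosine(v1, v2)     # -> 1.0 (same direction)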
# result = semEval(embedding_path, embedding_path)
# correlation = calculate_correlation(result)
# Evaluate every embedding file in parallel, one job per core (for the server).
num_cores = multiprocessing.cpu_count()
Parallel(n_jobs=num_cores)(delayed(semEval)("data/vec/" + file_name, file_name) for file_name in file_list)