-
Notifications
You must be signed in to change notification settings - Fork 4
/
evaluate.py
163 lines (128 loc) · 5.28 KB
/
evaluate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# -*- coding: utf-8 -*-
import ConfigParser
import numpy
import codecs
import sys
import time
import random
import math
import os
from copy import deepcopy
import json
from numpy.linalg import norm
from numpy import dot
from scipy.stats import spearmanr
# Per-language prefixes prepended to every word key in the vector dictionaries.
lp_map = {
    "english": u"en_",
    "german": u"de_",
    "italian": u"it_",
    "russian": u"ru_",
}
def normalise_word_vectors(word_vectors, norm=1.0):
    """
    Scale every vector in word_vectors to (approximately) unit length,
    then rescale it by the requested norm.

    The dictionary is modified in place and also returned for convenience.
    """
    for key in word_vectors:
        vector = word_vectors[key]
        # The 1e-6 term guards against division by zero for all-zero vectors,
        # at the cost of the result being very slightly shorter than `norm`.
        length = math.sqrt((vector ** 2).sum() + 1e-6)
        word_vectors[key] = (vector / length) * norm
    return word_vectors
def load_word_vectors(file_destination, language):
"""
This method loads the word vectors from the supplied file destination.
It loads the dictionary of word vectors and prints its size and the vector dimensionality.
"""
print "Loading word vectors from", file_destination
word_dictionary = {}
try:
f = codecs.open(file_destination, 'r', 'utf-8')
for line in f:
line = line.split(" ", 1)
key = line[0].lower()
if lp_map[language] not in key:
key = lp_map[language] + key
try:
transformed_key = unicode(key)
except:
print "CANT LOAD", transformed_key
word_dictionary[transformed_key] = numpy.fromstring(line[1], dtype="float32", sep=" ")
except:
print "Word vectors could not be loaded from:", file_destination
return {}
print len(word_dictionary), "vectors loaded from", file_destination
return normalise_word_vectors(word_dictionary)
def distance(v1, v2, normalised_vectors=False):
    """
    Return the cosine distance (1 - cosine similarity) between two vectors.

    When both vectors are already unit-length the denominator is 1, so the
    caller can skip it by passing normalised_vectors=True.
    """
    similarity = dot(v1, v2)
    if not normalised_vectors:
        similarity = similarity / (norm(v1) * norm(v2))
    return 1 - similarity
def simlex_analysis(word_vectors, language="german", source="simlex"):
    """
    Compute Spearman's rho between the gold-standard similarity ranking
    (SimLex-999, or a WordSim-353 variant selected via `source`) and the
    ranking induced by cosine distances over `word_vectors`.

    Returns (rho rounded to 3 decimals, number of word pairs covered by
    the supplied vectors).
    """
    # Collect (word pair, gold score) for every pair covered by the vectors.
    pair_list = []
    if source == "simlex":
        path = "evaluation/simlex-" + language + ".txt"
    else:
        # e.g. source = "english", "english-sim", "english-rel"
        path = "evaluation/ws-353/wordsim353-" + source + ".txt"
    fread_simlex = codecs.open(path, 'r', 'utf-8')
    try:
        line_number = 0
        for line in fread_simlex:
            if line_number > 0:  # skip the header line
                tokens = line.split()
                word_i = lp_map[language] + tokens[0].lower()
                word_j = lp_map[language] + tokens[1].lower()
                score = float(tokens[2])
                if word_i in word_vectors and word_j in word_vectors:
                    pair_list.append(((word_i, word_j), score))
            line_number += 1
    finally:
        fread_simlex.close()  # BUGFIX: the handle was never closed

    # Gold ranking: highest-scored (most similar) pairs first.
    pair_list.sort(key=lambda x: -x[1])
    coverage = len(pair_list)

    # Model ranking: smallest cosine distance (most similar) first.
    extracted_list = []
    for (word_i, word_j), _ in pair_list:
        current_distance = distance(word_vectors[word_i], word_vectors[word_j])
        extracted_list.append(((word_i, word_j), current_distance))
    extracted_list.sort(key=lambda x: x[1])

    # PERF: O(1) position lookup instead of calling list.index per pair,
    # which made the rank comparison O(n^2). Word pairs are unique for a
    # well-formed evaluation file, so this is behaviour-identical.
    position_by_pair = {}
    for position, (pair, _) in enumerate(extracted_list):
        if pair not in position_by_pair:
            position_by_pair[pair] = position

    spearman_original_list = []
    spearman_target_list = []
    for position_1, (word_pair, _) in enumerate(pair_list):
        spearman_original_list.append(position_1)
        spearman_target_list.append(position_by_pair[word_pair])

    spearman_rho = spearmanr(spearman_original_list, spearman_target_list)
    return round(spearman_rho[0], 3), coverage
def main():
"""
The user can provide the location of the config file as an argument.
If no location is specified, the default config file (experiment_parameters.cfg) is used.
"""
try:
word_vector_location = sys.argv[1]
language = sys.argv[2]
word_vectors = load_word_vectors(word_vector_location, language)
except:
print "USAGE: python code/simlex_evaluation.py word_vector_location language"
return
print "\n============= Evaluating word vectors for language:", language, " =============\n"
simlex_score, simlex_coverage = simlex_analysis(word_vectors, language)
print "SimLex-999 score and coverage:", simlex_score, simlex_coverage
# WordSim Validation scores:
c1, cov1 = simlex_analysis(word_vectors,language, source=language)
c2, cov2 = simlex_analysis(word_vectors,language, source=language + "-sim")
c3, cov3 = simlex_analysis(word_vectors,language, source=language + "-rel")
print "WordSim overall score and coverage:", c1, cov1
print "WordSim Similarity score and coverage:", c2, cov2
print "WordSim Relatedness score and coverage:", c3, cov3, "\n"
if __name__=='__main__':
main()