Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

taking into account lower for similarity dataset and adding SCWS dataset #30

Merged
merged 1 commit into from
Jul 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 54 additions & 15 deletions sinr/text/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pandas as pd
import urllib.request
import os
from tqdm.auto import tqdm

def fetch_data_MEN():
"""Fetch MEN dataset for testing relatedness similarity
Expand Down Expand Up @@ -65,7 +66,32 @@ def fetch_data_WS353():

return data

def eval_similarity(sinr_vec, dataset):
def fetch_data_SCWS():
    """Fetch SCWS dataset for testing relatedness similarity

    The CSV file is downloaded and parsed entirely in memory, so no
    temporary file is created in the current working directory and
    nothing is left behind if the download or the parsing fails.

    :return: dictionary-like object. Keys of interest:
             'X': matrix of 2 words per column,
             'y': vector with scores (raw scores divided by 5.0)
    :rtype: sklearn.datasets.base.Bunch

    """
    # Local import: keeps this block self-contained without touching the
    # file-level import section.
    import io

    url = 'https://raw.githubusercontent.com/jjlastra/HESML/master/HESML_Library/WN_Datasets/SCWS1994_dataset.csv'

    # The context manager closes the HTTP response even if read() raises.
    with urllib.request.urlopen(url) as response:
        raw = response.read()

    # The file has no header row and uses ';' as the field separator.
    data = pd.read_csv(io.BytesIO(raw), header=None, sep=";")

    # The first two columns hold the word pair; the remaining column(s)
    # hold the score, flattened to a 1-D vector.
    words = data.values[:, 0:2].astype("object")
    scores = (data.values[:, 2:].astype(float) / 5.0).ravel()

    return Bunch(X=words, y=scores)

def eval_similarity(sinr_vec, dataset, print_missing=True):
"""Evaluate similarity with Spearman correlation

:param sinr_vec: SINrVectors object
Expand All @@ -74,6 +100,8 @@ def eval_similarity(sinr_vec, dataset):
dictionary-like object. Keys of interest:
'X': matrix of 2 words per column,
'y': vector with scores

:param print_missing: boolean (default : True)

:return: Spearman correlation between cosine similarity and human rated similarity
:rtype: float
Expand All @@ -84,47 +112,58 @@ def eval_similarity(sinr_vec, dataset):
cosine_sim = list()

vocab = sinr_vec.vocab
missing_words = 0
missing_words = list()

# Mean vector
vec_mean = np.ravel(sinr_vec.vectors.mean(axis=0))

for i in range(len(dataset.X)):
for i in tqdm(range(len(dataset.X)), desc = 'eval similarity', leave = False):

# Words into vectors
# Missing words replaced by mean vector

if dataset.X[i][0] not in vocab:
vec1 = vec_mean
missing_words += 1
if dataset.X[i][0].lower() in vocab:
vec1 = sinr_vec._get_vector(sinr_vec._get_index(dataset.X[i][0].lower()))
else:
vec1 = vec_mean
if dataset.X[i][0] not in missing_words:
missing_words.append(dataset.X[i][0])
else:
vec1 = sinr_vec._get_vector(sinr_vec._get_index(dataset.X[i][0]))

if dataset.X[i][1] not in vocab:
vec2 = vec_mean
missing_words += 1
if dataset.X[i][1].lower() in vocab:
vec2 = sinr_vec._get_vector(sinr_vec._get_index(dataset.X[i][1].lower()))
else:
vec2 = vec_mean
if dataset.X[i][1] not in missing_words:
missing_words.append(dataset.X[i][0])
else:
vec2 = sinr_vec._get_vector(sinr_vec._get_index(dataset.X[i][1]))

# Cosine similarity
cosine_sim.append(np.dot(vec1,vec2)/(norm(vec1)*norm(vec2)))
scores.append(dataset.y[i])

print(str(missing_words) + ' missing words')
if print_missing == True:
print(str(len(missing_words)) + ' missing words')

return scipy.stats.spearmanr(cosine_sim, scores).correlation

def similarity_MEN_WS353(sinr_vec):
"""Evaluate similarity with MEN and WS353 datasets
def similarity_MEN_WS353_SCWS(sinr_vec, print_missing=True):
"""Evaluate similarity with MEN, WS353 and SCWS datasets

:param sinr_vec: SINrVectors object

:param print_missing: boolean (default : True)

:return: Spearman correlation for MEN and WS353 datasets
:return: Spearman correlation for MEN, WS353 and SCWS datasets
:rtype: dict

"""

sim_MEN = eval_similarity(sinr_vec, fetch_data_MEN())
sim_WS353 = eval_similarity(sinr_vec, fetch_data_WS353())
sim_MEN = eval_similarity(sinr_vec, fetch_data_MEN(), print_missing=print_missing)
sim_WS353 = eval_similarity(sinr_vec, fetch_data_WS353(), print_missing=print_missing)
sim_SCWS = eval_similarity(sinr_vec, fetch_data_SCWS(), print_missing=print_missing)

return {"MEN": sim_MEN, "WS353" : sim_WS353}
return {"MEN": sim_MEN, "WS353" : sim_WS353, "SCWS" : sim_SCWS}
13 changes: 7 additions & 6 deletions tests/test_sinr_evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import unittest

import sinr.graph_embeddings as ge
from sinr.text.evaluate import fetch_data_MEN, fetch_data_WS353, eval_similarity, similarity_MEN_WS353
from sinr.text.evaluate import fetch_data_MEN, fetch_data_WS353, eval_similarity, similarity_MEN_WS353_SCWS
import urllib.request
import os

Expand Down Expand Up @@ -36,12 +36,13 @@ def tearDown(self):

def test_eval_similarity(self):
res = round(eval_similarity(self.vectors, fetch_data_MEN()), 2)
self.assertEqual(res, 0.39)
self.assertGreater(res, 0.38)

def test_similarity_MEN_WS353(self):
res = similarity_MEN_WS353(self.vectors)
self.assertEqual(round(res["MEN"],2), 0.39)
self.assertEqual(round(res["WS353"],2), 0.44)
def test_similarity_MEN_WS353_SCWS(self):
res = similarity_MEN_WS353_SCWS(self.vectors)
self.assertGreater(round(res["MEN"],2), 0.38)
self.assertGreater(round(res["WS353"],2), 0.40)
self.assertGreater(round(res["SCWS"],2), 0.38)

if __name__ == '__main__':
unittest.main()