Skip to content

Commit

Permalink
Fix tests for EuclideanKeyedVectors.similarity_matrix. Fix #1961 (#1984)
Browse files Browse the repository at this point in the history

* Fix unit tests for similarity_matrix property

* Fix unit test to check for ones on diagonals

* Fix the exponent test for the KeyedVectors similarity_matrix method

* In similarity_matrix, process rows in index order when IDFs are the same

* Test that the tfidf parameter has desired effect on similarity_matrix
  • Loading branch information
Witiko authored and menshikh-iv committed Mar 18, 2018
1 parent 47d995a commit a781b40
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 13 deletions.
4 changes: 3 additions & 1 deletion gensim/models/keyedvectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -497,7 +497,8 @@ def similarity_matrix(self, dictionary, tfidf=None, threshold=0.0, exponent=2.0,
else:
assert max(tfidf.idfs) < matrix_order
word_indices = [
index for index, _ in sorted(tfidf.idfs.items(), key=lambda x: x[1], reverse=True)
index for index, _
in sorted(tfidf.idfs.items(), key=lambda x: (x[1], -x[0]), reverse=True)
]

# Traverse rows.
Expand All @@ -511,6 +512,7 @@ def similarity_matrix(self, dictionary, tfidf=None, threshold=0.0, exponent=2.0,
if w1 not in self.vocab:
num_skipped += 1
continue # A word from the dictionary is not present in the word2vec model.

# Traverse upper triangle columns.
if matrix_order <= nonzero_limit + 1: # Traverse all columns.
columns = (
Expand Down
55 changes: 43 additions & 12 deletions gensim/test/test_keyedvectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import numpy as np

from gensim.corpora import Dictionary
from gensim.models import KeyedVectors as EuclideanKeyedVectors
from gensim.models import KeyedVectors as EuclideanKeyedVectors, TfidfModel
from gensim.test.utils import datapath


Expand All @@ -27,33 +27,64 @@ def setUp(self):
self.vectors = EuclideanKeyedVectors.load_word2vec_format(
datapath('euclidean_vectors.bin'), binary=True, datatype=np.float64)

def similarity_matrix(self):
def test_similarity_matrix(self):
"""Test similarity_matrix returns expected results."""

corpus = [["government", "denied", "holiday"], ["holiday", "slowing", "hollingworth"]]
dictionary = Dictionary(corpus)
corpus = [dictionary.doc2bow(document) for document in corpus]
documents = [["government", "denied", "holiday"],
["holiday", "slowing", "hollingworth"]]
dictionary = Dictionary(documents)

# checking symmetry and the existence of ones on the diagonal
similarity_matrix = self.similarity_matrix(corpus, dictionary).todense()
similarity_matrix = self.vectors.similarity_matrix(dictionary).todense()
self.assertTrue((similarity_matrix.T == similarity_matrix).all())
self.assertTrue((np.diag(similarity_matrix) == similarity_matrix).all())
self.assertTrue(
(np.diag(similarity_matrix) ==
np.ones(similarity_matrix.shape[0])).all())

# checking that thresholding works as expected
similarity_matrix = self.similarity_matrix(corpus, dictionary, threshold=0.45).todense()
similarity_matrix = self.vectors.similarity_matrix(dictionary, threshold=0.45).todense()
self.assertEquals(18, np.sum(similarity_matrix == 0))

# checking that exponent works as expected
similarity_matrix = self.similarity_matrix(corpus, dictionary, exponent=1.0).todense()
self.assertAlmostEqual(9.5788956, np.sum(similarity_matrix))
similarity_matrix = self.vectors.similarity_matrix(dictionary, exponent=1.0).todense()
self.assertAlmostEqual(9.5788956, np.sum(similarity_matrix), places=5)

# checking that nonzero_limit works as expected
similarity_matrix = self.similarity_matrix(corpus, dictionary, nonzero_limit=4).todense()
similarity_matrix = self.vectors.similarity_matrix(dictionary, nonzero_limit=4).todense()
self.assertEquals(4, np.sum(similarity_matrix == 0))

similarity_matrix = self.similarity_matrix(corpus, dictionary, nonzero_limit=3).todense()
similarity_matrix = self.vectors.similarity_matrix(dictionary, nonzero_limit=3).todense()
self.assertEquals(20, np.sum(similarity_matrix == 0))

# check that processing rows in the order given by IDF has desired effect

# The complete similarity matrix we would obtain with nonzero_limit would look as follows:
documents = [["honour", "understanding"], ["understanding", "mean", "knop"]]
dictionary = Dictionary(documents)
tfidf = TfidfModel(dictionary=dictionary)

# All terms except for "understanding" have IDF of log2(2 / 1) = log2(2) = 1.
# The term "understanding" has IDF of log2(2 / 2) = log2(1) = 0.
#
# If we do not pass the tfidf parameter to the similarity_matrix
# method, then we process rows in the order from 1 to 4. If we do pass
# the tfidf parameter to the similarity_matrix method, then we first
# process the rows 1, 3, 4 that correspond to terms with IDF of 1.0 and
# then the row 2 that corresponds to the term "understanding" with IDF
# of 0. Since the method is greedy, we will end up with two different
# similarity matrices.

similarity_matrix = self.vectors.similarity_matrix(
dictionary, nonzero_limit=2).todense()
self.assertTrue(np.all(np.isclose(similarity_matrix, np.array([
[1, 0.9348248, 0, 0], [0.9348248, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]))))

similarity_matrix = self.vectors.similarity_matrix(
dictionary, tfidf, nonzero_limit=2).todense()
self.assertTrue(np.all(np.isclose(similarity_matrix, np.array([
[1, 0.9348248, 0, 0.9112908], [0.9348248, 1, 0.90007025, 0], [0, 0.90007025, 1, 0],
[0.9112908, 0, 0, 1]]))))

def test_most_similar(self):
"""Test most_similar returns expected results."""
expected = [
Expand Down

0 comments on commit a781b40

Please sign in to comment.