Fix tests for EuclideanKeyedVectors.similarity_matrix. Fix #1961 (#1984)

* Fix unit tests for similarity_matrix property
* Fix unit test to check for ones on diagonals
* Fix the exponent test for the KeyedVectors similarity_matrix method
* In similarity_matrix, process rows in index order when IDFs are the same
* Test that the tfidf parameter has desired effect on similarity_matrix
piskvorky · Mar 18, 2018 · a781b40 · a781b40
1 parent 47d995a
commit a781b40
Show file tree

Hide file tree

Showing 2 changed files with 46 additions and 13 deletions.
diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py
@@ -497,7 +497,8 @@ def similarity_matrix(self, dictionary, tfidf=None, threshold=0.0, exponent=2.0,
         else:
             assert max(tfidf.idfs) < matrix_order
             word_indices = [
-                index for index, _ in sorted(tfidf.idfs.items(), key=lambda x: x[1], reverse=True)
+                index for index, _
+                in sorted(tfidf.idfs.items(), key=lambda x: (x[1], -x[0]), reverse=True)
             ]
 
         # Traverse rows.
@@ -511,6 +512,7 @@ def similarity_matrix(self, dictionary, tfidf=None, threshold=0.0, exponent=2.0,
             if w1 not in self.vocab:
                 num_skipped += 1
                 continue  # A word from the dictionary is not present in the word2vec model.
+
             # Traverse upper triangle columns.
             if matrix_order <= nonzero_limit + 1:  # Traverse all columns.
                 columns = (

diff --git a/gensim/test/test_keyedvectors.py b/gensim/test/test_keyedvectors.py
@@ -15,7 +15,7 @@
 import numpy as np
 
 from gensim.corpora import Dictionary
-from gensim.models import KeyedVectors as EuclideanKeyedVectors
+from gensim.models import KeyedVectors as EuclideanKeyedVectors, TfidfModel
 from gensim.test.utils import datapath
 
 
@@ -27,33 +27,64 @@ def setUp(self):
         self.vectors = EuclideanKeyedVectors.load_word2vec_format(
             datapath('euclidean_vectors.bin'), binary=True, datatype=np.float64)
 
-    def similarity_matrix(self):
+    def test_similarity_matrix(self):
         """Test similarity_matrix returns expected results."""
 
-        corpus = [["government", "denied", "holiday"], ["holiday", "slowing", "hollingworth"]]
-        dictionary = Dictionary(corpus)
-        corpus = [dictionary.doc2bow(document) for document in corpus]
+        documents = [["government", "denied", "holiday"],
+                  ["holiday", "slowing", "hollingworth"]]
+        dictionary = Dictionary(documents)
 
         # checking symmetry and the existence of ones on the diagonal
-        similarity_matrix = self.similarity_matrix(corpus, dictionary).todense()
+        similarity_matrix = self.vectors.similarity_matrix(dictionary).todense()
         self.assertTrue((similarity_matrix.T == similarity_matrix).all())
-        self.assertTrue((np.diag(similarity_matrix) == similarity_matrix).all())
+        self.assertTrue(
+            (np.diag(similarity_matrix) ==
+             np.ones(similarity_matrix.shape[0])).all())
 
         # checking that thresholding works as expected
-        similarity_matrix = self.similarity_matrix(corpus, dictionary, threshold=0.45).todense()
+        similarity_matrix = self.vectors.similarity_matrix(dictionary, threshold=0.45).todense()
         self.assertEquals(18, np.sum(similarity_matrix == 0))
 
         # checking that exponent works as expected
-        similarity_matrix = self.similarity_matrix(corpus, dictionary, exponent=1.0).todense()
-        self.assertAlmostEqual(9.5788956, np.sum(similarity_matrix))
+        similarity_matrix = self.vectors.similarity_matrix(dictionary, exponent=1.0).todense()
+        self.assertAlmostEqual(9.5788956, np.sum(similarity_matrix), places=5)
 
         # checking that nonzero_limit works as expected
-        similarity_matrix = self.similarity_matrix(corpus, dictionary, nonzero_limit=4).todense()
+        similarity_matrix = self.vectors.similarity_matrix(dictionary, nonzero_limit=4).todense()
         self.assertEquals(4, np.sum(similarity_matrix == 0))
 
-        similarity_matrix = self.similarity_matrix(corpus, dictionary, nonzero_limit=3).todense()
+        similarity_matrix = self.vectors.similarity_matrix(dictionary, nonzero_limit=3).todense()
         self.assertEquals(20, np.sum(similarity_matrix == 0))
 
+        # check that processing rows in the order given by IDF has desired effect
+
+        # Build a second, four-term dictionary and its TF-IDF model for this check:
+        documents = [["honour", "understanding"], ["understanding", "mean", "knop"]]
+        dictionary = Dictionary(documents)
+        tfidf = TfidfModel(dictionary=dictionary)
+
+        # All terms except for "understanding" have an IDF of log2(2 / 1) = log2(2) = 1.
+        # The term "understanding" has an IDF of log2(2 / 2) = log2(1) = 0.
+        #
+        # If we do not pass the tfidf parameter to the similarity_matrix
+        # method, then we process the rows in order from 1 to 4. If we do pass
+        # the tfidf parameter to the similarity_matrix method, then we first
+        # process rows 1, 3, and 4, which correspond to terms with an IDF of 1,
+        # and then row 2, which corresponds to the term "understanding" with an
+        # IDF of 0. Since the method is greedy, we end up with two different
+        # similarity matrices.
+
+        similarity_matrix = self.vectors.similarity_matrix(
+            dictionary, nonzero_limit=2).todense()
+        self.assertTrue(np.all(np.isclose(similarity_matrix, np.array([
+            [1, 0.9348248, 0, 0], [0.9348248, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1]]))))
+
+        similarity_matrix = self.vectors.similarity_matrix(
+            dictionary, tfidf, nonzero_limit=2).todense()
+        self.assertTrue(np.all(np.isclose(similarity_matrix, np.array([
+            [1, 0.9348248, 0, 0.9112908], [0.9348248, 1, 0.90007025, 0], [0, 0.90007025, 1, 0],
+            [0.9112908, 0, 0, 1]]))))
+
     def test_most_similar(self):
         """Test most_similar returns expected results."""
         expected = [