Add sparse input support in interfaces_getitem when num_best is not None. Fix #1294 (#1321)

* added any2sparse_clipped() function * changed full2sparse_clipped to any2sparse_clipped in __getitem__ * added missing whitespace * return topn from any2sparse_clipped() * efficient any2sparse_clipped implementation * added unit test for any2sparse_clipped * function call corrected * removed any2sparse_clipped and added scipy2scipy_clipped * added new code path for maintain_sparsity * added unit tests for new function and issue * fixed flake8 errors * fixed matrix_indptr * added requested changes * replaced hasattr with getattr * call abs() once for entire matrix in scipy2scipy_clipped * removed matrix.sort_indices and removed indptr while calling argsort
piskvorky · Jun 22, 2017 · dfd7da4 · dfd7da4
1 parent 0d47a6f
commit dfd7da4
Show file tree

Hide file tree

Showing 3 changed files with 73 additions and 0 deletions.
diff --git a/gensim/interfaces.py b/gensim/interfaces.py
@@ -222,6 +222,11 @@ def __getitem__(self, query):
         if self.num_best is None:
             return result
 
+        # if maintain_sparsity is True, result is a scipy sparse matrix. Sort, clip the
+        # topn and return as a scipy sparse matrix.
+        if getattr(self, 'maintain_sparsity', False):
+                return matutils.scipy2scipy_clipped(result, self.num_best)
+
         # if the input query was a corpus (=more documents), compute the top-n
         # most similar for each document in turn
         if matutils.ismatrix(result):

diff --git a/gensim/matutils.py b/gensim/matutils.py
@@ -166,6 +166,43 @@ def any2sparse(vec, eps=1e-9):
     return [(int(fid), float(fw)) for fid, fw in vec if np.abs(fw) > eps]
 
 
+def scipy2scipy_clipped(matrix, topn, eps=1e-9):
+    """
+    Return a scipy.sparse vector/matrix consisting of 'topn' elements of the greatest magnitude (absolute value).
+
+    `matrix` may be any scipy.sparse matrix; it is converted to CSR before processing. For every row,
+    at most `topn` stored entries are kept, selected and ordered by `argsort(abs(values), topn, reverse=True)`.
+    `eps` is accepted for signature compatibility with the other conversion helpers but is not used.
+
+    The result is a CSR matrix with as many rows as `matrix`. Its column count is one more than the
+    largest column index that was kept (not necessarily the column count of the input), or zero when
+    the input has no stored entries. For `topn <= 0` the empty matrix `csr_matrix([])` is returned.
+
+    Raise ValueError if `matrix` is not a scipy.sparse matrix.
+    """
+    if not scipy.sparse.issparse(matrix):
+        raise ValueError("'%s' is not a scipy sparse vector." % matrix)
+    if topn <= 0:
+        return scipy.sparse.csr_matrix([])
+    # The code below reads the CSR `indices`/`data` arrays directly, so any other sparse format
+    # (CSC, COO, ...) has to be converted first. CSR input is returned as-is by tocsr().
+    matrix = matrix.tocsr()
+    num_rows = matrix.shape[0]
+    # Without stored entries there is nothing to select, and the column count cannot be derived
+    # from the (empty) index array further down, so return an empty result right away.
+    if matrix.nnz == 0:
+        return scipy.sparse.csr_matrix((num_rows, 0), dtype=matrix.dtype)
+    # Return clipped sparse vector if input is a sparse vector.
+    if num_rows == 1:
+        # use np.argpartition/argsort and only form tuples that are actually returned.
+        biggest = argsort(abs(matrix.data), topn, reverse=True)
+        indices, data = matrix.indices.take(biggest), matrix.data.take(biggest)
+        return scipy.sparse.csr_matrix((data, indices, [0, len(indices)]))
+    # Return clipped sparse matrix if input is a matrix, processing row by row.
+    matrix_indices = []
+    matrix_data = []
+    matrix_indptr = [0]
+    # calling abs() on entire matrix once is faster than calling abs() iteratively for each row
+    matrix_abs = abs(matrix)
+    for i in range(num_rows):
+        v = matrix.getrow(i)
+        v_abs = matrix_abs.getrow(i)
+        # Sort and clip each row vector first.
+        biggest = argsort(v_abs.data, topn, reverse=True)
+        indices, data = v.indices.take(biggest), v.data.take(biggest)
+        # Store the topn indices and values of each row vector.
+        matrix_data.append(data)
+        matrix_indices.append(indices)
+        # Advance the row pointer by the number of entries actually stored for this row,
+        # so that indptr always stays consistent with the concatenated data.
+        matrix_indptr.append(matrix_indptr[-1] + len(indices))
+    matrix_indices = np.concatenate(matrix_indices)
+    matrix_data = np.concatenate(matrix_data)
+    # Instantiate and return a sparse csr_matrix which preserves the order of indices/data.
+    return scipy.sparse.csr_matrix(
+        (matrix_data, matrix_indices, matrix_indptr),
+        shape=(num_rows, np.max(matrix_indices) + 1)
+    )
+
+
 def scipy2sparse(vec, eps=1e-9):
     """Convert a scipy.sparse vector into gensim document format (=list of 2-tuples)."""
     vec = vec.tocsr()

diff --git a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py
@@ -109,6 +109,22 @@ def test_full2sparse_clipped(self):
         expected = [(0, 0.80000000000000004), (1, 0.20000000000000001), (5, -0.14999999999999999)]
         self.assertTrue(matutils.full2sparse_clipped(vec, topn=3), expected)
 
+    def test_scipy2scipy_clipped(self):
+        """Check that scipy2scipy_clipped keeps the `topn` largest-magnitude entries and returns scipy sparse output."""
+        vec = [0.8, 0.2, 0.0, 0.0, -0.1, -0.15]
+        expected = [(0, 0.80000000000000004), (1, 0.20000000000000001), (5, -0.14999999999999999)]
+
+        # Test for scipy vector/row
+        vec_scipy = scipy.sparse.csr_matrix(vec)
+        vec_scipy_clipped = matutils.scipy2scipy_clipped(vec_scipy, topn=3)
+        self.assertTrue(scipy.sparse.issparse(vec_scipy_clipped))
+        # assertEqual, not assertTrue: assertTrue(x, y) treats `y` as the failure message
+        # and would pass for any truthy `x` without comparing the two values.
+        self.assertEqual(matutils.scipy2sparse(vec_scipy_clipped), expected)
+
+        # Test for scipy matrix
+        matrix_scipy = scipy.sparse.csr_matrix([vec] * 3)
+        matrix_scipy_clipped = matutils.scipy2scipy_clipped(matrix_scipy, topn=3)
+        self.assertTrue(scipy.sparse.issparse(matrix_scipy_clipped))
+        self.assertEqual([matutils.scipy2sparse(x) for x in matrix_scipy_clipped], [expected] * 3)
 
 
     def testChunking(self):
@@ -406,6 +422,21 @@ def testMaintainSparsity(self):
         self.assertTrue(scipy.sparse.issparse(sparse_sims))
         numpy.testing.assert_array_equal(dense_sims, sparse_sims.todense())
 
+    def testMaintainSparsityWithNumBest(self):
+        """Tests that sparsity is correctly maintained when maintain_sparsity=True and num_best is not None"""
+        feature_count = len(dictionary)
+
+        def top3_similarities(keep_sparse):
+            # Build a fresh index for the requested mode and query it with the whole corpus.
+            index = self.cls(corpus, num_features=feature_count, maintain_sparsity=keep_sparse, num_best=3)
+            return index[corpus]
+
+        dense_topn_sims = top3_similarities(False)
+        scipy_topn_sims = top3_similarities(True)
+
+        # Only the maintain_sparsity=True result may be a scipy sparse matrix ...
+        self.assertFalse(scipy.sparse.issparse(dense_topn_sims))
+        self.assertTrue(scipy.sparse.issparse(scipy_topn_sims))
+        # ... and, converted row by row to gensim format, it must match the dense top-n result.
+        sparse_as_gensim = [matutils.scipy2sparse(row) for row in scipy_topn_sims]
+        self.assertEqual(dense_topn_sims, sparse_as_gensim)
+
+
 
 class TestSimilarity(unittest.TestCase, _TestSimilarityABC):
     def setUp(self):