Skip to content

Commit

Permalink
Add sparse input support in interfaces_getitem when num_best is not N…
Browse files Browse the repository at this point in the history
…one. Fix #1294 (#1321)

* added any2sparse_clipped() function

* changed full2sparse_clipped to any2sparse_clipped in __getitem__

* added missing whitespace

* return topn from any2sparse_clipped()

* efficient any2sparse_clipped implementation

* added unit test for any2sparse_clipped

* function call corrected

* removed any2sparse_clipped and added scipy2scipy_clipped

* added new code path for maintain_sparsity

* added unit tests for new function and issue

* fixed flake8 errors

* fixed matrix_indptr

* added requested changes

* replaced hasattr with getattr

* call abs() once for entire matrix in scipy2scipy_clipped

* removed matrix.sort_indices and removed indptr while calling argsort
  • Loading branch information
manneshiva authored and menshikh-iv committed Jun 22, 2017
1 parent 0d47a6f commit dfd7da4
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 0 deletions.
5 changes: 5 additions & 0 deletions gensim/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,11 @@ def __getitem__(self, query):
if self.num_best is None:
return result

# if maintain_sparsity is True, result is scipy sparse. Sort, clip the
# topn and return as a scipy sparse matrix.
if getattr(self, 'maintain_sparsity', False):
return matutils.scipy2scipy_clipped(result, self.num_best)

# if the input query was a corpus (=more documents), compute the top-n
# most similar for each document in turn
if matutils.ismatrix(result):
Expand Down
37 changes: 37 additions & 0 deletions gensim/matutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,43 @@ def any2sparse(vec, eps=1e-9):
return [(int(fid), float(fw)) for fid, fw in vec if np.abs(fw) > eps]


def scipy2scipy_clipped(matrix, topn, eps=1e-9):
    """Clip every row of a scipy.sparse vector/matrix to its `topn` entries of greatest magnitude.

    Parameters
    ----------
    matrix : scipy.sparse matrix
        Input vector (a single row) or matrix. Any scipy sparse format is accepted;
        it is converted to CSR before the stored entries are read.
    topn : int
        Maximum number of stored entries to keep per row.
    eps : float, optional
        Not used by this function; kept so existing callers that pass it keep working.

    Returns
    -------
    scipy.sparse.csr_matrix
        CSR matrix with the same shape as `matrix`. Within each row, the kept entries
        are stored in the order returned by ``argsort(abs(row), topn, reverse=True)``;
        that order is preserved, the indices are not re-sorted.
        If `topn` <= 0, an empty CSR matrix built from ``[]`` is returned instead.

    Raises
    ------
    ValueError
        If `matrix` is not a scipy sparse matrix.

    """
    if not scipy.sparse.issparse(matrix):
        raise ValueError("'%s' is not a scipy sparse vector." % matrix)
    if topn <= 0:
        return scipy.sparse.csr_matrix([])

    # `.indices` / `.indptr` only describe rows in CSR layout; for CSR input this returns the matrix itself.
    matrix = matrix.tocsr()
    num_rows = matrix.shape[0]
    if num_rows == 0:
        # Nothing to clip, and np.concatenate() below would reject an empty list.
        return scipy.sparse.csr_matrix(matrix.shape, dtype=matrix.dtype)

    # Taking abs() of all stored values once is cheaper than doing it row by row.
    abs_data = np.abs(matrix.data)
    clipped_indices = []
    clipped_data = []
    clipped_indptr = [0]
    for row in range(num_rows):
        start, end = matrix.indptr[row], matrix.indptr[row + 1]
        # Positions (relative to the row start) of the `topn` largest magnitudes in this row.
        biggest = argsort(abs_data[start:end], topn, reverse=True)
        clipped_indices.append(matrix.indices[start:end].take(biggest))
        clipped_data.append(matrix.data[start:end].take(biggest))
        clipped_indptr.append(clipped_indptr[-1] + len(biggest))

    # Pass the input shape explicitly: inferring the column count from the largest surviving
    # index would drop trailing columns and fails outright when no entry survives.
    return scipy.sparse.csr_matrix(
        (np.concatenate(clipped_data), np.concatenate(clipped_indices), clipped_indptr),
        shape=matrix.shape,
    )


def scipy2sparse(vec, eps=1e-9):
"""Convert a scipy.sparse vector into gensim document format (=list of 2-tuples)."""
vec = vec.tocsr()
Expand Down
31 changes: 31 additions & 0 deletions gensim/test/test_similarities.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,22 @@ def test_full2sparse_clipped(self):
expected = [(0, 0.80000000000000004), (1, 0.20000000000000001), (5, -0.14999999999999999)]
self.assertTrue(matutils.full2sparse_clipped(vec, topn=3), expected)

def test_scipy2scipy_clipped(self):
    """Check that scipy2scipy_clipped keeps the three largest-magnitude entries of each row."""
    vec = [0.8, 0.2, 0.0, 0.0, -0.1, -0.15]
    expected = [(0, 0.8), (1, 0.2), (5, -0.15)]

    # Test for scipy vector/row
    vec_scipy = scipy.sparse.csr_matrix(vec)
    vec_scipy_clipped = matutils.scipy2scipy_clipped(vec_scipy, topn=3)
    self.assertTrue(scipy.sparse.issparse(vec_scipy_clipped))
    # assertEqual, not assertTrue: assertTrue(value, expected) would take `expected`
    # as the failure message and never compare the two.
    self.assertEqual(matutils.scipy2sparse(vec_scipy_clipped), expected)

    # Test for scipy matrix: every row is the same vector, so every row clips to `expected`.
    matrix_scipy = scipy.sparse.csr_matrix([vec] * 3)
    matrix_scipy_clipped = matutils.scipy2scipy_clipped(matrix_scipy, topn=3)
    self.assertTrue(scipy.sparse.issparse(matrix_scipy_clipped))
    self.assertEqual([matutils.scipy2sparse(x) for x in matrix_scipy_clipped], [expected] * 3)


def testChunking(self):
Expand Down Expand Up @@ -406,6 +422,21 @@ def testMaintainSparsity(self):
self.assertTrue(scipy.sparse.issparse(sparse_sims))
numpy.testing.assert_array_equal(dense_sims, sparse_sims.todense())

def testMaintainSparsityWithNumBest(self):
    """Compare top-3 similarities with and without maintain_sparsity when num_best is set."""
    feature_count = len(dictionary)

    def query_top3(keep_sparse):
        # Build a fresh index for the requested mode and query it with the whole corpus.
        index = self.cls(corpus, num_features=feature_count, maintain_sparsity=keep_sparse, num_best=3)
        return index[corpus]

    dense_result = query_top3(False)
    sparse_result = query_top3(True)

    # Only the maintain_sparsity=True result may be a scipy sparse matrix.
    self.assertFalse(scipy.sparse.issparse(dense_result))
    self.assertTrue(scipy.sparse.issparse(sparse_result))

    # Row by row, the sparse result must carry the same (id, similarity) pairs as the dense one.
    sparse_as_lists = [matutils.scipy2sparse(row) for row in sparse_result]
    self.assertEqual(dense_result, sparse_as_lists)



class TestSimilarity(unittest.TestCase, _TestSimilarityABC):
def setUp(self):
Expand Down

0 comments on commit dfd7da4

Please sign in to comment.