Skip to content

Commit

Permalink
Add LsiModel.docs_processed attribute (#763)
Browse files Browse the repository at this point in the history
  • Loading branch information
hobson authored and tmylk committed Jun 30, 2016
1 parent 9cbc9ca commit 9f7fee2
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 50 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Changes
- In hdpmodel and dtmmodel
- NOT BACKWARDS COMPATIBLE!
* Added random_state parameter to LdaState initializer and check_random_state() (@droudy, #113)
* Implemented LsiModel.docs_processed attribute (@hobson, #763)

0.13.1, 2016-06-22

Expand Down
3 changes: 3 additions & 0 deletions gensim/models/lsimodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,7 @@ def add_documents(self, corpus, chunksize=None, decay=None):
num_terms=self.num_terms, chunksize=chunksize,
extra_dims=self.extra_samples, power_iters=self.power_iters)
self.projection.merge(update, decay=decay)
self.docs_processed += len(corpus) if hasattr(corpus, '__len__') else 0
else:
# the one-pass algo
doc_no = 0
Expand Down Expand Up @@ -395,6 +396,7 @@ def add_documents(self, corpus, chunksize=None, decay=None):
if self.dispatcher:
logger.info("reached the end of input; now waiting for all remaining jobs to finish")
self.projection = self.dispatcher.getstate()
self.docs_processed += doc_no
# logger.info("top topics after adding %i documents" % doc_no)
# self.print_debug(10)
else:
Expand All @@ -403,6 +405,7 @@ def add_documents(self, corpus, chunksize=None, decay=None):
update = Projection(self.num_terms, self.num_topics, corpus.tocsc(), extra_dims=self.extra_samples, power_iters=self.power_iters)
self.projection.merge(update, decay=decay)
logger.info("processed sparse job of %i documents", corpus.shape[1])
self.docs_processed += corpus.shape[1]

def __str__(self):
return "LsiModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % (
Expand Down
20 changes: 15 additions & 5 deletions gensim/test/test_corpora.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,17 @@

import numpy

from gensim.utils import to_unicode, smart_extension
from gensim.utils import to_unicode
from gensim.interfaces import TransformedCorpus
from gensim.corpora import (bleicorpus, mmcorpus, lowcorpus, svmlightcorpus,
ucicorpus, malletcorpus, textcorpus, indexedcorpus)

# needed because sample data files are located in the same folder
module_path = os.path.dirname(__file__)
datapath = lambda fname: os.path.join(module_path, 'test_data', fname)


def datapath(fname):
    """Return the path of a sample data file.

    The file name `fname` is joined onto the ``test_data`` directory that
    lives under `module_path` (the directory of this test module).
    """
    return os.path.join(module_path, 'test_data', fname)


def testfile():
Expand Down Expand Up @@ -180,7 +183,7 @@ def test_indexing(self):
self.assertEqual(len(docs), len(corpus))
self.assertEqual(len(docs), len(corpus[:]))
self.assertEqual(len(docs[::2]), len(corpus[::2]))

def _get_slice(corpus, slice_):
# assertRaises for python 2.6 takes a callable
return corpus[slice_]
Expand All @@ -200,9 +203,9 @@ def _get_slice(corpus, slice_):
# corpus does, and throws an error otherwise
if hasattr(corpus, 'index') and corpus.index is not None:
corpus_ = TransformedCorpus(DummyTransformer(), corpus)
self.assertEqual(corpus_[0][0][1], docs[0][0][1]+1)
self.assertEqual(corpus_[0][0][1], docs[0][0][1] + 1)
self.assertRaises(ValueError, _get_slice, corpus_, set([1]))
transformed_docs = [val+1 for i, d in enumerate(docs) for _, val in d if i in [1, 3, 4]]
transformed_docs = [val + 1 for i, d in enumerate(docs) for _, val in d if i in [1, 3, 4]]
self.assertEquals(transformed_docs, list(v for doc in corpus_[[1, 3, 4]] for _, v in doc))
self.assertEqual(3, len(corpus_[[1, 3, 4]]))
else:
Expand All @@ -214,12 +217,19 @@ def _get_slice(corpus, slice_):
class TestMmCorpus(CorpusTestCase):
    """Corpus tests for ``mmcorpus.MmCorpus``, built on ``CorpusTestCase``."""

    def setUp(self):
        """Select the corpus class, load the sample corpus and set the file extension."""
        self.corpus_class = mmcorpus.MmCorpus
        # Sample corpus loaded once per test; read by test_load().
        self.corpus = self.corpus_class(datapath('testcorpus.mm'))
        self.file_extension = '.mm'

    def test_serialize_compressed(self):
        # MmCorpus needs file write with seek => doesn't support compressed output (only input)
        pass

    def test_load(self):
        """Check the statistics and index of the corpus loaded from ``testcorpus.mm``."""
        self.assertEqual(self.corpus.num_docs, 9)
        self.assertEqual(self.corpus.num_terms, 12)
        self.assertEqual(self.corpus.num_nnz, 28)
        self.assertEqual(tuple(self.corpus.index), (97, 121, 169, 201, 225, 249, 258, 276, 303))


class TestSvmLightCorpus(CorpusTestCase):
def setUp(self):
Expand Down
92 changes: 47 additions & 45 deletions gensim/test/test_lsimodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,20 +24,23 @@
from gensim import matutils


module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder


def datapath(fname):
    """Return the path of a sample data file.

    The file name `fname` is joined onto the ``test_data`` directory that
    lives under `module_path` (the directory of this test module).
    """
    return os.path.join(module_path, 'test_data', fname)


# set up vars used in testing ("Deerwester" from the web tutorial)
texts = [['human', 'interface', 'computer'],
['survey', 'user', 'computer', 'system', 'response', 'time'],
['eps', 'user', 'interface', 'system'],
['system', 'human', 'system', 'eps'],
['user', 'response', 'time'],
['trees'],
['graph', 'trees'],
['graph', 'minors', 'trees'],
['graph', 'minors', 'survey']]
['survey', 'user', 'computer', 'system', 'response', 'time'],
['eps', 'user', 'interface', 'system'],
['system', 'human', 'system', 'eps'],
['user', 'response', 'time'],
['trees'],
['graph', 'trees'],
['graph', 'minors', 'trees'],
['graph', 'minors', 'survey']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

Expand All @@ -59,16 +62,15 @@ def testTransform(self):

# make sure the decomposition is enough accurate
u, s, vt = scipy.linalg.svd(matutils.corpus2dense(self.corpus, self.corpus.num_terms), full_matrices=False)
self.assertTrue(numpy.allclose(s[:2], model.projection.s)) # singular values must match
self.assertTrue(numpy.allclose(s[:2], model.projection.s)) # singular values must match

# transform one document
doc = list(self.corpus)[0]
transformed = model[doc]
vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests
expected = numpy.array([-0.6594664, 0.142115444]) # scaled LSI version
# expected = numpy.array([-0.1973928, 0.05591352]) # non-scaled LSI version
self.assertTrue(numpy.allclose(abs(vec), abs(expected))) # transformed entries must be equal up to sign

vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests
expected = numpy.array([-0.6594664, 0.142115444]) # scaled LSI version
# expected = numpy.array([-0.1973928, 0.05591352]) # non-scaled LSI version
self.assertTrue(numpy.allclose(abs(vec), abs(expected))) # transformed entries must be equal up to sign

def testShowTopic(self):
topic = self.model.show_topic(1)
Expand All @@ -77,7 +79,6 @@ def testShowTopic(self):
self.assertTrue(isinstance(k, six.string_types))
self.assertTrue(isinstance(v, float))


def testShowTopics(self):
topics = self.model.show_topics(formatted=False)

Expand All @@ -88,58 +89,55 @@ def testShowTopics(self):
self.assertTrue(isinstance(k, six.string_types))
self.assertTrue(isinstance(v, float))


def testCorpusTransform(self):
"""Test lsi[corpus] transformation."""
model = self.model
got = numpy.vstack(matutils.sparse2full(doc, 2) for doc in model[self.corpus])
expected = numpy.array([
[ 0.65946639, 0.14211544],
[ 2.02454305, -0.42088759],
[ 1.54655361, 0.32358921],
[ 1.81114125, 0.5890525 ],
[ 0.9336738 , -0.27138939],
[ 0.01274618, -0.49016181],
[ 0.04888203, -1.11294699],
[ 0.08063836, -1.56345594],
[ 0.27381003, -1.34694159]])
self.assertTrue(numpy.allclose(abs(got), abs(expected))) # must equal up to sign

[0.65946639, 0.14211544],
[2.02454305, -0.42088759],
[1.54655361, 0.32358921],
[1.81114125, 0.5890525 ],
[0.9336738 , -0.27138939],
[0.01274618, -0.49016181],
[0.04888203, -1.11294699],
[0.08063836, -1.56345594],
[0.27381003, -1.34694159]])
self.assertTrue(numpy.allclose(abs(got), abs(expected))) # must equal up to sign

def testOnlineTransform(self):
corpus = list(self.corpus)
doc = corpus[0] # use the corpus' first document for testing
doc = corpus[0] # use the corpus' first document for testing

# create the transformation model
model2 = lsimodel.LsiModel(corpus=corpus, num_topics=5) # compute everything at once
model = lsimodel.LsiModel(corpus=None, id2word=model2.id2word, num_topics=5) # start with no documents, we will add them later
model2 = lsimodel.LsiModel(corpus=corpus, num_topics=5) # compute everything at once
model = lsimodel.LsiModel(corpus=None, id2word=model2.id2word, num_topics=5) # start with no documents, we will add them later

# train model on a single document
model.add_documents([corpus[0]])

# transform the testing document with this partial transformation
transformed = model[doc]
vec = matutils.sparse2full(transformed, model.num_topics) # convert to dense vector, for easier equality tests
expected = numpy.array([-1.73205078, 0.0, 0.0, 0.0, 0.0]) # scaled LSI version
self.assertTrue(numpy.allclose(abs(vec), abs(expected), atol=1e-6)) # transformed entries must be equal up to sign
vec = matutils.sparse2full(transformed, model.num_topics) # convert to dense vector, for easier equality tests
expected = numpy.array([-1.73205078, 0.0, 0.0, 0.0, 0.0]) # scaled LSI version
self.assertTrue(numpy.allclose(abs(vec), abs(expected), atol=1e-6)) # transformed entries must be equal up to sign

# train on another 4 documents
model.add_documents(corpus[1:5], chunksize=2) # train on 4 extra docs, in chunks of 2 documents, for the lols
model.add_documents(corpus[1:5], chunksize=2) # train on 4 extra docs, in chunks of 2 documents, for the lols

# transform a document with this partial transformation
transformed = model[doc]
vec = matutils.sparse2full(transformed, model.num_topics) # convert to dense vector, for easier equality tests
expected = numpy.array([-0.66493785, -0.28314203, -1.56376302, 0.05488682, 0.17123269]) # scaled LSI version
self.assertTrue(numpy.allclose(abs(vec), abs(expected), atol=1e-6)) # transformed entries must be equal up to sign
vec = matutils.sparse2full(transformed, model.num_topics) # convert to dense vector, for easier equality tests
expected = numpy.array([-0.66493785, -0.28314203, -1.56376302, 0.05488682, 0.17123269]) # scaled LSI version
self.assertTrue(numpy.allclose(abs(vec), abs(expected), atol=1e-6)) # transformed entries must be equal up to sign

# train on the rest of documents
model.add_documents(corpus[5:])

# make sure the final transformation is the same as if we had decomposed the whole corpus at once
vec1 = matutils.sparse2full(model[doc], model.num_topics)
vec2 = matutils.sparse2full(model2[doc], model2.num_topics)
self.assertTrue(numpy.allclose(abs(vec1), abs(vec2), atol=1e-5)) # the two LSI representations must equal up to sign

self.assertTrue(numpy.allclose(abs(vec1), abs(vec2), atol=1e-5)) # the two LSI representations must equal up to sign

def testPersistence(self):
fname = testfile()
Expand All @@ -150,7 +148,7 @@ def testPersistence(self):
self.assertTrue(numpy.allclose(model.projection.u, model2.projection.u))
self.assertTrue(numpy.allclose(model.projection.s, model2.projection.s))
tstvec = []
self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector
self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector

def testPersistenceCompressed(self):
fname = testfile() + '.gz'
Expand All @@ -161,7 +159,7 @@ def testPersistenceCompressed(self):
self.assertTrue(numpy.allclose(model.projection.u, model2.projection.u))
self.assertTrue(numpy.allclose(model.projection.s, model2.projection.s))
tstvec = []
self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector
self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector

def testLargeMmap(self):
fname = testfile()
Expand All @@ -178,7 +176,7 @@ def testLargeMmap(self):
self.assertTrue(numpy.allclose(model.projection.u, model2.projection.u))
self.assertTrue(numpy.allclose(model.projection.s, model2.projection.s))
tstvec = []
self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector
self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector

def testLargeMmapCompressed(self):
fname = testfile() + '.gz'
Expand All @@ -194,7 +192,11 @@ def testLargeMmapCompressed(self):
# to be mmaped!
self.assertRaises(IOError, lsimodel.LsiModel.load, fname, mmap='r')

#endclass TestLsiModel
def testDocsProcessed(self):
    """Check that ``docs_processed`` on the fixture model equals the corpus size."""
    # The literal 9 pins the expected count; the second assertion ties it to
    # the number of documents the fixture corpus itself reports.
    self.assertEqual(self.model.docs_processed, 9)
    self.assertEqual(self.model.docs_processed, self.corpus.num_docs)

# endclass TestLsiModel


if __name__ == '__main__':
Expand Down

0 comments on commit 9f7fee2

Please sign in to comment.