Add LsiModel.docs_processed attribute (#763)

piskvorky · Jun 30, 2016 · 9f7fee2 · 9f7fee2
1 parent 9cbc9ca
commit 9f7fee2
Show file tree

Hide file tree

Showing 4 changed files with 66 additions and 50 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,7 @@ Changes
   - In hdpmodel and dtmmodel
   - NOT BACKWARDS COMPATIBLE!
 * Added random_state parameter to LdaState initializer and check_random_state() (@droudy, #113)
+* Implemented LsiModel.docs_processed attribute (#763)
 
 0.13.1, 2016-06-22
 

diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py
@@ -361,6 +361,7 @@ def add_documents(self, corpus, chunksize=None, decay=None):
                     num_terms=self.num_terms, chunksize=chunksize,
                     extra_dims=self.extra_samples, power_iters=self.power_iters)
                 self.projection.merge(update, decay=decay)
+                self.docs_processed += len(corpus) if hasattr(corpus, '__len__') else 0
             else:
                 # the one-pass algo
                 doc_no = 0
@@ -395,6 +396,7 @@ def add_documents(self, corpus, chunksize=None, decay=None):
                 if self.dispatcher:
                     logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                     self.projection = self.dispatcher.getstate()
+                self.docs_processed += doc_no
 #            logger.info("top topics after adding %i documents" % doc_no)
 #            self.print_debug(10)
         else:
@@ -403,6 +405,7 @@ def add_documents(self, corpus, chunksize=None, decay=None):
             update = Projection(self.num_terms, self.num_topics, corpus.tocsc(), extra_dims=self.extra_samples, power_iters=self.power_iters)
             self.projection.merge(update, decay=decay)
             logger.info("processed sparse job of %i documents", corpus.shape[1])
+            self.docs_processed += corpus.shape[1]
 
     def __str__(self):
         return "LsiModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % (

diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py
@@ -16,14 +16,17 @@
 
 import numpy
 
-from gensim.utils import to_unicode, smart_extension
+from gensim.utils import to_unicode
 from gensim.interfaces import TransformedCorpus
 from gensim.corpora import (bleicorpus, mmcorpus, lowcorpus, svmlightcorpus,
                             ucicorpus, malletcorpus, textcorpus, indexedcorpus)
 
 # needed because sample data files are located in the same folder
 module_path = os.path.dirname(__file__)
-datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
+
+
+def datapath(fname):
+    return os.path.join(module_path, 'test_data', fname)
 
 
 def testfile():
@@ -180,7 +183,7 @@ def test_indexing(self):
         self.assertEqual(len(docs), len(corpus))
         self.assertEqual(len(docs), len(corpus[:]))
         self.assertEqual(len(docs[::2]), len(corpus[::2]))
-        
+
         def _get_slice(corpus, slice_):
             # assertRaises for python 2.6 takes a callable
             return corpus[slice_]
@@ -200,9 +203,9 @@ def _get_slice(corpus, slice_):
         # corpus does, and throws an error otherwise
         if hasattr(corpus, 'index') and corpus.index is not None:
             corpus_ = TransformedCorpus(DummyTransformer(), corpus)
-            self.assertEqual(corpus_[0][0][1], docs[0][0][1]+1)
+            self.assertEqual(corpus_[0][0][1], docs[0][0][1] + 1)
             self.assertRaises(ValueError, _get_slice, corpus_, set([1]))
-            transformed_docs = [val+1 for i, d in enumerate(docs) for _, val in d if i in [1, 3, 4]]
+            transformed_docs = [val + 1 for i, d in enumerate(docs) for _, val in d if i in [1, 3, 4]]
             self.assertEquals(transformed_docs, list(v for doc in corpus_[[1, 3, 4]] for _, v in doc))
             self.assertEqual(3, len(corpus_[[1, 3, 4]]))
         else:
@@ -214,12 +217,19 @@ def _get_slice(corpus, slice_):
 class TestMmCorpus(CorpusTestCase):
     def setUp(self):
         self.corpus_class = mmcorpus.MmCorpus
+        self.corpus = self.corpus_class(datapath('testcorpus.mm'))
         self.file_extension = '.mm'
 
     def test_serialize_compressed(self):
         # MmCorpus needs file write with seek => doesn't support compressed output (only input)
         pass
 
+    def test_load(self):
+        self.assertEqual(self.corpus.num_docs, 9)
+        self.assertEqual(self.corpus.num_terms, 12)
+        self.assertEqual(self.corpus.num_nnz, 28)
+        self.assertEqual(tuple(self.corpus.index), (97, 121, 169, 201, 225, 249, 258, 276, 303))
+
 
 class TestSvmLightCorpus(CorpusTestCase):
     def setUp(self):

diff --git a/gensim/test/test_lsimodel.py b/gensim/test/test_lsimodel.py
@@ -24,20 +24,23 @@
 from gensim import matutils
 
 
-module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
-datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
+module_path = os.path.dirname(__file__)  # needed because sample data files are located in the same folder
+
+
+def datapath(fname):
+    return os.path.join(module_path, 'test_data', fname)
 
 
 # set up vars used in testing ("Deerwester" from the web tutorial)
 texts = [['human', 'interface', 'computer'],
- ['survey', 'user', 'computer', 'system', 'response', 'time'],
- ['eps', 'user', 'interface', 'system'],
- ['system', 'human', 'system', 'eps'],
- ['user', 'response', 'time'],
- ['trees'],
- ['graph', 'trees'],
- ['graph', 'minors', 'trees'],
- ['graph', 'minors', 'survey']]
+         ['survey', 'user', 'computer', 'system', 'response', 'time'],
+         ['eps', 'user', 'interface', 'system'],
+         ['system', 'human', 'system', 'eps'],
+         ['user', 'response', 'time'],
+         ['trees'],
+         ['graph', 'trees'],
+         ['graph', 'minors', 'trees'],
+         ['graph', 'minors', 'survey']]
 dictionary = Dictionary(texts)
 corpus = [dictionary.doc2bow(text) for text in texts]
 
@@ -59,16 +62,15 @@ def testTransform(self):
 
         # make sure the decomposition is enough accurate
         u, s, vt = scipy.linalg.svd(matutils.corpus2dense(self.corpus, self.corpus.num_terms), full_matrices=False)
-        self.assertTrue(numpy.allclose(s[:2], model.projection.s)) # singular values must match
+        self.assertTrue(numpy.allclose(s[:2], model.projection.s))  # singular values must match
 
         # transform one document
         doc = list(self.corpus)[0]
         transformed = model[doc]
-        vec = matutils.sparse2full(transformed, 2) # convert to dense vector, for easier equality tests
-        expected = numpy.array([-0.6594664, 0.142115444]) # scaled LSI version
-        # expected = numpy.array([-0.1973928, 0.05591352]) # non-scaled LSI version
-        self.assertTrue(numpy.allclose(abs(vec), abs(expected))) # transformed entries must be equal up to sign
-
+        vec = matutils.sparse2full(transformed, 2)  # convert to dense vector, for easier equality tests
+        expected = numpy.array([-0.6594664, 0.142115444])  # scaled LSI version
+        # expected = numpy.array([-0.1973928, 0.05591352])  # non-scaled LSI version
+        self.assertTrue(numpy.allclose(abs(vec), abs(expected)))  # transformed entries must be equal up to sign
 
     def testShowTopic(self):
         topic = self.model.show_topic(1)
@@ -77,7 +79,6 @@ def testShowTopic(self):
             self.assertTrue(isinstance(k, six.string_types))
             self.assertTrue(isinstance(v, float))
 
-
     def testShowTopics(self):
         topics = self.model.show_topics(formatted=False)
 
@@ -88,58 +89,55 @@ def testShowTopics(self):
                 self.assertTrue(isinstance(k, six.string_types))
                 self.assertTrue(isinstance(v, float))
 
-
     def testCorpusTransform(self):
         """Test lsi[corpus] transformation."""
         model = self.model
         got = numpy.vstack(matutils.sparse2full(doc, 2) for doc in model[self.corpus])
         expected = numpy.array([
-            [ 0.65946639,  0.14211544],
-            [ 2.02454305, -0.42088759],
-            [ 1.54655361,  0.32358921],
-            [ 1.81114125,  0.5890525 ],
-            [ 0.9336738 , -0.27138939],
-            [ 0.01274618, -0.49016181],
-            [ 0.04888203, -1.11294699],
-            [ 0.08063836, -1.56345594],
-            [ 0.27381003, -1.34694159]])
-        self.assertTrue(numpy.allclose(abs(got), abs(expected))) # must equal up to sign
-
+            [0.65946639,  0.14211544],
+            [2.02454305, -0.42088759],
+            [1.54655361,  0.32358921],
+            [1.81114125,  0.5890525 ],
+            [0.9336738 , -0.27138939],
+            [0.01274618, -0.49016181],
+            [0.04888203, -1.11294699],
+            [0.08063836, -1.56345594],
+            [0.27381003, -1.34694159]])
+        self.assertTrue(numpy.allclose(abs(got), abs(expected)))  # must equal up to sign
 
     def testOnlineTransform(self):
         corpus = list(self.corpus)
-        doc = corpus[0] # use the corpus' first document for testing
+        doc = corpus[0]  # use the corpus' first document for testing
 
         # create the transformation model
-        model2 = lsimodel.LsiModel(corpus=corpus, num_topics=5) # compute everything at once
-        model = lsimodel.LsiModel(corpus=None, id2word=model2.id2word, num_topics=5) # start with no documents, we will add them later
+        model2 = lsimodel.LsiModel(corpus=corpus, num_topics=5)  # compute everything at once
+        model = lsimodel.LsiModel(corpus=None, id2word=model2.id2word, num_topics=5)  # start with no documents, we will add them later
 
         # train model on a single document
         model.add_documents([corpus[0]])
 
         # transform the testing document with this partial transformation
         transformed = model[doc]
-        vec = matutils.sparse2full(transformed, model.num_topics) # convert to dense vector, for easier equality tests
-        expected = numpy.array([-1.73205078, 0.0, 0.0, 0.0, 0.0]) # scaled LSI version
-        self.assertTrue(numpy.allclose(abs(vec), abs(expected), atol=1e-6)) # transformed entries must be equal up to sign
+        vec = matutils.sparse2full(transformed, model.num_topics)  # convert to dense vector, for easier equality tests
+        expected = numpy.array([-1.73205078, 0.0, 0.0, 0.0, 0.0])  # scaled LSI version
+        self.assertTrue(numpy.allclose(abs(vec), abs(expected), atol=1e-6))  # transformed entries must be equal up to sign
 
         # train on another 4 documents
-        model.add_documents(corpus[1:5], chunksize=2) # train on 4 extra docs, in chunks of 2 documents, for the lols
+        model.add_documents(corpus[1:5], chunksize=2)  # train on 4 extra docs, in chunks of 2 documents, for the lols
 
         # transform a document with this partial transformation
         transformed = model[doc]
-        vec = matutils.sparse2full(transformed, model.num_topics) # convert to dense vector, for easier equality tests
-        expected = numpy.array([-0.66493785, -0.28314203, -1.56376302, 0.05488682, 0.17123269]) # scaled LSI version
-        self.assertTrue(numpy.allclose(abs(vec), abs(expected), atol=1e-6)) # transformed entries must be equal up to sign
+        vec = matutils.sparse2full(transformed, model.num_topics)  # convert to dense vector, for easier equality tests
+        expected = numpy.array([-0.66493785, -0.28314203, -1.56376302, 0.05488682, 0.17123269])  # scaled LSI version
+        self.assertTrue(numpy.allclose(abs(vec), abs(expected), atol=1e-6))  # transformed entries must be equal up to sign
 
         # train on the rest of documents
         model.add_documents(corpus[5:])
 
         # make sure the final transformation is the same as if we had decomposed the whole corpus at once
         vec1 = matutils.sparse2full(model[doc], model.num_topics)
         vec2 = matutils.sparse2full(model2[doc], model2.num_topics)
-        self.assertTrue(numpy.allclose(abs(vec1), abs(vec2), atol=1e-5)) # the two LSI representations must equal up to sign
-
+        self.assertTrue(numpy.allclose(abs(vec1), abs(vec2), atol=1e-5))  # the two LSI representations must equal up to sign
 
     def testPersistence(self):
         fname = testfile()
@@ -150,7 +148,7 @@ def testPersistence(self):
         self.assertTrue(numpy.allclose(model.projection.u, model2.projection.u))
         self.assertTrue(numpy.allclose(model.projection.s, model2.projection.s))
         tstvec = []
-        self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector
+        self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector
 
     def testPersistenceCompressed(self):
         fname = testfile() + '.gz'
@@ -161,7 +159,7 @@ def testPersistenceCompressed(self):
         self.assertTrue(numpy.allclose(model.projection.u, model2.projection.u))
         self.assertTrue(numpy.allclose(model.projection.s, model2.projection.s))
         tstvec = []
-        self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector
+        self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector
 
     def testLargeMmap(self):
         fname = testfile()
@@ -178,7 +176,7 @@ def testLargeMmap(self):
         self.assertTrue(numpy.allclose(model.projection.u, model2.projection.u))
         self.assertTrue(numpy.allclose(model.projection.s, model2.projection.s))
         tstvec = []
-        self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector
+        self.assertTrue(numpy.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector
 
     def testLargeMmapCompressed(self):
         fname = testfile() + '.gz'
@@ -194,7 +192,11 @@ def testLargeMmapCompressed(self):
         # to be mmaped!
         self.assertRaises(IOError, lsimodel.LsiModel.load, fname, mmap='r')
 
-#endclass TestLsiModel
+    def testDocsProcessed(self):
+        self.assertEqual(self.model.docs_processed, 9)
+        self.assertEqual(self.model.docs_processed, self.corpus.num_docs)
+
+# endclass TestLsiModel
 
 
 if __name__ == '__main__':