From accc6253e40b424a081b0c0c3c65d765682caa06 Mon Sep 17 00:00:00 2001 From: Philip Robinson Date: Wed, 18 Jul 2018 09:40:51 -0700 Subject: [PATCH 1/5] test for #1589 --- gensim/test/test_atmodel.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/gensim/test/test_atmodel.py b/gensim/test/test_atmodel.py index d2625f6ede..573f2f0258 100644 --- a/gensim/test/test_atmodel.py +++ b/gensim/test/test_atmodel.py @@ -49,15 +49,19 @@ ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], - ['graph', 'minors', 'survey']] + ['graph', 'minors', 'survey'], + ['only_occurs_once_in_corpus_and_alone_in_doc'], +] dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] # Assign some authors randomly to the documents above. -author2doc = {'john': [0, 1, 2, 3, 4, 5, 6], 'jane': [2, 3, 4, 5, 6, 7, 8], 'jack': [0, 2, 4, 6, 8], 'jill': [1, 3, 5, 7]} +author2doc = {'john': [0, 1, 2, 3, 4, 5, 6], 'jane': [2, 3, 4, 5, 6, 7, 8], 'jack': [0, 2, 4, 6, 8], 'jill': [1, 3, 5, 7], 'joaquin': [9]} doc2author = {0: ['john', 'jack'], 1: ['john', 'jill'], 2: ['john', 'jane', 'jack'], 3: ['john', 'jane', 'jill'], 4: ['john', 'jane', 'jack'], 5: ['john', 'jane', 'jill'], 6: ['john', 'jane', 'jack'], 7: ['jane', 'jill'], - 8: ['jane', 'jack']} + 8: ['jane', 'jack'], + 9: ['juaqin'], +} # More data with new and old authors (to test update method). 
# Although the text is just a subset of the previous, the model @@ -116,6 +120,15 @@ def testBasic(self): jill_topics = matutils.sparse2full(jill_topics, model.num_topics) self.assertTrue(all(jill_topics > 0)) + def testEmptyDocument(self): + _dictionary = Dictionary(texts) + _dictionary.filter_extremes(no_below=2) + _corpus = [_dictionary.doc2bow(text) for text in texts] + try: + model = self.class_(_corpus, author2doc=author2doc, id2word=_dictionary, num_topics=2) + except IndexError: + raise IndexError("error occurs in 1.0.0 release tag") + def testAuthor2docMissing(self): # Check that the results are the same if author2doc is constructed automatically from doc2author. model = self.class_(corpus, author2doc=author2doc, doc2author=doc2author, id2word=dictionary, num_topics=2, random_state=0) From e3e47efa3f732e2203d9c9f3719fd66e4078cb29 Mon Sep 17 00:00:00 2001 From: Philip Robinson Date: Wed, 18 Jul 2018 09:34:34 -0700 Subject: [PATCH 2/5] bugfix #1589 --- gensim/models/atmodel.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 4f550b9ffe..c3987ceef0 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -391,15 +391,17 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c doc_no = d # Get the IDs and counts of all the words in the current document. # TODO: this is duplication of code in LdaModel. Refactor. + if doc and not isinstance(doc[0][0], six.integer_types): # make sure the term IDs are ints, otherwise np will get upset ids = [int(id) for id, _ in doc] else: ids = [id for id, _ in doc] - cts = np.array([cnt for _, cnt in doc]) + ids = np.array(ids, dtype=np.integer) + cts = np.array([cnt for _, cnt in doc], dtype=np.integer) # Get all authors in current document, and convert the author names to integer IDs. 
- authors_d = [self.author2id[a] for a in self.doc2author[doc_no]] + authors_d = np.array([self.author2id[a] for a in self.doc2author[doc_no]], dtype=np.integer) gammad = self.state.gamma[authors_d, :] # gamma of document d before update. tilde_gamma = gammad.copy() # gamma that will be updated. @@ -828,9 +830,9 @@ def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, else: doc_no = d # Get all authors in current document, and convert the author names to integer IDs. - authors_d = [self.author2id[a] for a in self.doc2author[doc_no]] - ids = np.array([id for id, _ in doc]) # Word IDs in doc. - cts = np.array([cnt for _, cnt in doc]) # Word counts. + authors_d = np.array([self.author2id[a] for a in self.doc2author[doc_no]], dtype=np.integer) + ids = np.array([id for id, _ in doc], dtype=np.integer) # Word IDs in doc. + cts = np.array([cnt for _, cnt in doc], dtype=np.integer) # Word counts. if d % self.chunksize == 0: logger.debug("bound: at document #%i in chunk", d) From db7453108d67afdf266592eddce47628cc16f7e2 Mon Sep 17 00:00:00 2001 From: Philip Robinson Date: Fri, 20 Jul 2018 15:55:24 -0700 Subject: [PATCH 3/5] ignore unused assigned variable --- gensim/test/test_atmodel.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gensim/test/test_atmodel.py b/gensim/test/test_atmodel.py index ff1e844aa1..43ff670bd0 100644 --- a/gensim/test/test_atmodel.py +++ b/gensim/test/test_atmodel.py @@ -117,9 +117,10 @@ def testEmptyDocument(self): _a2d = author2doc.copy() _a2d['joaquin'] = [len(_local_texts) - 1] try: - model = self.class_(_corpus, author2doc=_a2d, id2word=_dictionary, num_topics=2) + _ = self.class_(_corpus, author2doc=_a2d, id2word=_dictionary, num_topics=2) except IndexError: raise IndexError("error occurs in 1.0.0 release tag") + assert(_) def testAuthor2docMissing(self): # Check that the results are the same if author2doc is constructed automatically from doc2author. 
From 8aa04b28c3aee82aff0a0934df7c0a51ed77452f Mon Sep 17 00:00:00 2001 From: Philip Robinson Date: Wed, 1 Aug 2018 12:45:34 -0700 Subject: [PATCH 4/5] PR review --- gensim/models/atmodel.py | 2 +- gensim/test/test_atmodel.py | 18 ++++++++---------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 28d2ddf6c7..d0a5940512 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -460,7 +460,7 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c # make sure the term IDs are ints, otherwise np will get upset ids = [int(idx) for idx, _ in doc] else: - ids = [id for id, _ in doc] + ids = [idx for idx, _ in doc] ids = np.array(ids, dtype=np.integer) cts = np.array([cnt for _, cnt in doc], dtype=np.integer) diff --git a/gensim/test/test_atmodel.py b/gensim/test/test_atmodel.py index 43ff670bd0..63cff65fd7 100644 --- a/gensim/test/test_atmodel.py +++ b/gensim/test/test_atmodel.py @@ -110,16 +110,14 @@ def testBasic(self): self.assertTrue(all(jill_topics > 0)) def testEmptyDocument(self): - _local_texts = common_texts + [['only_occurs_once_in_corpus_and_alone_in_doc']] - _dictionary = Dictionary(_local_texts) - _dictionary.filter_extremes(no_below=2) - _corpus = [_dictionary.doc2bow(text) for text in _local_texts] - _a2d = author2doc.copy() - _a2d['joaquin'] = [len(_local_texts) - 1] - try: - _ = self.class_(_corpus, author2doc=_a2d, id2word=_dictionary, num_topics=2) - except IndexError: - raise IndexError("error occurs in 1.0.0 release tag") + local_texts = common_texts + [['only_occurs_once_in_corpus_and_alone_in_doc']] + dictionary = Dictionary(local_texts) + dictionary.filter_extremes(no_below=2) + corpus = [dictionary.doc2bow(text) for text in local_texts] + a2d = author2doc.copy() + a2d['joaquin'] = [len(local_texts) - 1] + + _ = self.class_(corpus, author2doc=a2d, id2word=dictionary, num_topics=2) assert(_) def testAuthor2docMissing(self): From 
ddf8dec6337c9547a31646bac615e51e11dcdf7f Mon Sep 17 00:00:00 2001 From: Philip Date: Wed, 1 Aug 2018 20:44:52 -0700 Subject: [PATCH 5/5] Update test_atmodel.py --- gensim/test/test_atmodel.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gensim/test/test_atmodel.py b/gensim/test/test_atmodel.py index 63cff65fd7..50e6a32ea9 100644 --- a/gensim/test/test_atmodel.py +++ b/gensim/test/test_atmodel.py @@ -117,8 +117,7 @@ def testEmptyDocument(self): a2d = author2doc.copy() a2d['joaquin'] = [len(local_texts) - 1] - _ = self.class_(corpus, author2doc=a2d, id2word=dictionary, num_topics=2) - assert(_) + self.class_(corpus, author2doc=a2d, id2word=dictionary, num_topics=2) def testAuthor2docMissing(self): # Check that the results are the same if author2doc is constructed automatically from doc2author.