fixes after review:

- fix hanging indent - restore assertTrue line - fix bug in _get_average_score on Python 2 - refactor bm25 slightly
piskvorky · Dec 14, 2018 · 14339f2 · 14339f2
1 parent b0037ef
commit 14339f2
Show file tree

Hide file tree

Showing 5 changed files with 13 additions and 5 deletions.
diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
@@ -568,7 +568,7 @@ def init_dir_prior(self, prior, name):
             if prior == 'symmetric':
                 logger.info("using symmetric %s at %s", name, 1.0 / self.num_topics)
                 init_prior = np.fromiter((1.0 / self.num_topics for i in xrange(prior_shape)),
-                                            dtype=self.dtype, count=prior_shape)
+                                         dtype=self.dtype, count=prior_shape)
             elif prior == 'asymmetric':
                 init_prior = \
                     np.fromiter((1.0 / (i + np.sqrt(prior_shape)) for i in xrange(prior_shape)),
@@ -578,7 +578,7 @@ def init_dir_prior(self, prior, name):
             elif prior == 'auto':
                 is_auto = True
                 init_prior = np.fromiter((1.0 / self.num_topics for i in xrange(prior_shape)),
-                                            dtype=self.dtype, count=prior_shape)
+                                         dtype=self.dtype, count=prior_shape)
                 if name == 'alpha':
                     logger.info("using autotuned %s, starting with %s", name, list(init_prior))
             else:

diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py
@@ -79,17 +79,21 @@ def __init__(self, corpus):
 
         """
         self.corpus_size = len(corpus)
-        self.doc_len = [len(x) for x in corpus]
-        self.avgdl = float(sum(self.doc_len)) / self.corpus_size
+        self.avgdl = 0
         self.corpus = corpus
         self.f = []
         self.df = {}
         self.idf = {}
+        self.doc_len = []
         self.initialize()
 
     def initialize(self):
         """Calculates frequencies of terms in documents and in corpus. Also computes inverse document frequencies."""
+        num_doc = 0
         for document in self.corpus:
+            num_doc += len(document)
+            self.doc_len.append(len(document))
+
             frequencies = {}
             for word in document:
                 if word not in frequencies:
@@ -102,6 +106,8 @@ def initialize(self):
                     self.df[word] = 0
                 self.df[word] += 1
 
+        self.avgdl = float(num_doc) / self.corpus_size
+
         for word, freq in iteritems(self.df):
             self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)
 

diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py
@@ -427,7 +427,7 @@ def _get_average_score(concept, _keywords):
     """
     word_list = concept.split()
     word_counter = len(word_list)
-    total = sum(_keywords[word] for word in word_list)
+    total = float(sum(_keywords[word] for word in word_list))
     return total / word_counter
 
 

diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py
@@ -674,6 +674,7 @@ def online_sanity(self, model):
                 terro.append(l)
             else:
                 others.append(l)
+        self.assertTrue(all('terrorism' not in l for l in others))
         model.build_vocab(others)
         model.train(others, total_examples=model.corpus_count, epochs=model.epochs)
         # checks that `vectors` is different from `vectors_vocab`

diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
@@ -246,6 +246,7 @@ def onlineSanity(self, model, trained_model=False):
                 terro.append(l)
             else:
                 others.append(l)
+        self.assertTrue(all('terrorism' not in l for l in others))
         model.build_vocab(others, update=trained_model)
         model.train(others, total_examples=model.corpus_count, epochs=model.epochs)
         self.assertFalse('terrorism' in model.wv.vocab)