Skip to content

Commit

Permalink
fixes after review:
Browse files Browse the repository at this point in the history
- hanging indent
- restore the assertTrue line
- fix integer-division bug in _get_average_score under Python 2
- refactor bm25 a bit
  • Loading branch information
horpto committed Dec 14, 2018
1 parent b0037ef commit 14339f2
Show file tree
Hide file tree
Showing 5 changed files with 13 additions and 5 deletions.
4 changes: 2 additions & 2 deletions gensim/models/ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -568,7 +568,7 @@ def init_dir_prior(self, prior, name):
if prior == 'symmetric':
logger.info("using symmetric %s at %s", name, 1.0 / self.num_topics)
init_prior = np.fromiter((1.0 / self.num_topics for i in xrange(prior_shape)),
dtype=self.dtype, count=prior_shape)
dtype=self.dtype, count=prior_shape)
elif prior == 'asymmetric':
init_prior = \
np.fromiter((1.0 / (i + np.sqrt(prior_shape)) for i in xrange(prior_shape)),
Expand All @@ -578,7 +578,7 @@ def init_dir_prior(self, prior, name):
elif prior == 'auto':
is_auto = True
init_prior = np.fromiter((1.0 / self.num_topics for i in xrange(prior_shape)),
dtype=self.dtype, count=prior_shape)
dtype=self.dtype, count=prior_shape)
if name == 'alpha':
logger.info("using autotuned %s, starting with %s", name, list(init_prior))
else:
Expand Down
10 changes: 8 additions & 2 deletions gensim/summarization/bm25.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,17 +79,21 @@ def __init__(self, corpus):
"""
self.corpus_size = len(corpus)
self.doc_len = [len(x) for x in corpus]
self.avgdl = float(sum(self.doc_len)) / self.corpus_size
self.avgdl = 0
self.corpus = corpus
self.f = []
self.df = {}
self.idf = {}
self.doc_len = []
self.initialize()

def initialize(self):
"""Calculates frequencies of terms in documents and in corpus. Also computes inverse document frequencies."""
num_doc = 0
for document in self.corpus:
num_doc += len(document)
self.doc_len.append(len(document))

frequencies = {}
for word in document:
if word not in frequencies:
Expand All @@ -102,6 +106,8 @@ def initialize(self):
self.df[word] = 0
self.df[word] += 1

self.avgdl = float(num_doc) / self.corpus_size

for word, freq in iteritems(self.df):
self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)

Expand Down
2 changes: 1 addition & 1 deletion gensim/summarization/keywords.py
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,7 @@ def _get_average_score(concept, _keywords):
"""
word_list = concept.split()
word_counter = len(word_list)
total = sum(_keywords[word] for word in word_list)
total = float(sum(_keywords[word] for word in word_list))
return total / word_counter


Expand Down
1 change: 1 addition & 0 deletions gensim/test/test_fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -674,6 +674,7 @@ def online_sanity(self, model):
terro.append(l)
else:
others.append(l)
self.assertTrue(all('terrorism' not in l for l in others))
model.build_vocab(others)
model.train(others, total_examples=model.corpus_count, epochs=model.epochs)
# checks that `vectors` is different from `vectors_vocab`
Expand Down
1 change: 1 addition & 0 deletions gensim/test/test_word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,7 @@ def onlineSanity(self, model, trained_model=False):
terro.append(l)
else:
others.append(l)
self.assertTrue(all('terrorism' not in l for l in others))
model.build_vocab(others, update=trained_model)
model.train(others, total_examples=model.corpus_count, epochs=model.epochs)
self.assertFalse('terrorism' in model.wv.vocab)
Expand Down

0 comments on commit 14339f2

Please sign in to comment.