Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix empty output bug in Phrases. Fix #1401 #1853

Merged
merged 8 commits into from
Feb 15, 2018
6 changes: 5 additions & 1 deletion gensim/models/phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ def _is_single(obj):
is a corpus if it is an iterable of documents.
"""
obj_iter = iter(obj)
temp_iter = obj_iter
try:
peek = next(obj_iter)
obj_iter = it.chain([peek], obj_iter)
Expand All @@ -113,9 +114,12 @@ def _is_single(obj):
if isinstance(peek, string_types):
# It's a document, return the iterator
return True, obj_iter
if temp_iter == obj:
# Checking for iterator to the object
return False, obj_iter
else:
# If the first item isn't a string, assume obj is a corpus
return False, obj_iter
return False, obj


class SentenceAnalyzer(object):
Expand Down
7 changes: 7 additions & 0 deletions gensim/test/test_phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,13 @@ def setUp(self):
self.bigram_unicode = Phrases(
self.unicode_sentences, min_count=1, threshold=1, common_terms=self.common_terms)

# Regression test for the empty-output bug named in this PR's title: chaining
# a second Phraser on top of a first one must not produce an empty result.
def testEmptyPhrasifiedSentencesIterator(self):
# Build a Phrases model from the fixture sentences and wrap it in a Phraser.
bigram_phrases = Phrases(self.sentences)
bigram_phraser = Phraser(bigram_phrases)
# Build the second-level model from the output of the first Phraser.
trigram_phrases = Phrases(bigram_phraser[self.sentences])
trigram_phraser = Phraser(trigram_phrases)
# The chained transformation must report a non-zero length.
self.assertNotEqual(trigram_phraser[bigram_phraser[self.sentences]].__len__(), 0)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe it would be better to make this test more similar to the example in the PR:

trigrams = trigram_phraser[bigram_phraser[self.sentences]]
fst, snd = list(trigrams), list(trigrams)
self.assertEqual(fst, snd)
self.assertNotEqual(snd, [])


def testEmptyInputsOnBigramConstruction(self):
"""Test that empty inputs don't throw errors and return the expected result."""
# Empty list -> empty list
Expand Down