Skip to content

Commit

Permalink
Phrases and Phraser allow a generator corpus (#1099)
Browse files Browse the repository at this point in the history
* Phrases and Phraser allow a generator corpus

Allow Phrases and Phraser models to take a generator
function/expression as input to the transformation method. Previously,
only indexable iterables could be used, which is problematic for large
corpora.

Add additional tests to test_phrases using a generator as input.

* Update and simplify _is_single predicate

The _is_single function in phrases.py is now simpler and has the same
contract as the is_corpus function in utils.

* Fixes use of next in phrases, and adds tests for empty input to phrases models
  • Loading branch information
ELind77 authored and tmylk committed Jan 27, 2017
1 parent 3cb735b commit 34759be
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 10 deletions.
39 changes: 29 additions & 10 deletions gensim/models/phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,14 +62,39 @@
import logging
import warnings
from collections import defaultdict
import itertools as it

from six import iteritems, string_types
from six import iteritems, string_types, next

from gensim import utils, interfaces

logger = logging.getLogger(__name__)


def _is_single(obj):
"""
Check whether `obj` is a single document or an entire corpus.
Returns (is_single, new) 2-tuple, where `new` yields the same
sequence as `obj`.
`obj` is a single document if it is an iterable of strings. It
is a corpus if it is an iterable of documents.
"""
obj_iter = iter(obj)
try:
peek = next(obj_iter)
obj_iter = it.chain([peek], obj_iter)
except StopIteration:
# An empty object is a single document
return True, obj
if isinstance(peek, string_types):
# It's a document, return the iterator
return True, obj_iter
else:
# If the first item isn't a string, assume obj is a corpus
return False, obj_iter


class Phrases(interfaces.TransformationABC):
"""
Detect phrases, based on collected collocation counts. Adjacent words that appear
Expand Down Expand Up @@ -246,10 +271,8 @@ def __getitem__(self, sentence):
"""
warnings.warn("For a faster implementation, use the gensim.models.phrases.Phraser class")
try:
is_single = not sentence or isinstance(sentence[0], string_types)
except:
is_single = False

is_single, sentence = _is_single(sentence)
if not is_single:
# if the input is an entire corpus (rather than a single sentence),
# return an iterable stream.
Expand Down Expand Up @@ -327,7 +350,6 @@ def __init__(self, phrases_model):
logger.info('Phraser added %i phrasegrams', count)
logger.info('Phraser built with %i %i phrasegrams', count, len(self.phrasegrams))


def __getitem__(self, sentence):
"""
Convert the input tokens `sentence` (=list of unicode strings) into phrase
Expand All @@ -339,10 +361,7 @@ def __getitem__(self, sentence):
into phrases on the fly, one after another.
"""
try:
is_single = not sentence or isinstance(sentence[0], string_types)
except:
is_single = False
is_single, sentence = _is_single(sentence)
if not is_single:
# if the input is an entire corpus (rather than a single sentence),
# return an iterable stream.
Expand Down
36 changes: 36 additions & 0 deletions gensim/test/test_phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@
unicode_sentences = [[utils.to_unicode(w) for w in sentence] for sentence in sentences]


def gen_sentences():
    """Return the test corpus as a one-shot generator of one-shot token generators."""
    corpus_stream = (
        (token for token in document)
        for document in sentences
    )
    return corpus_stream


class TestPhrasesCommon(unittest.TestCase):
""" Tests that need to be run for both Phrases and Phraser classes."""
def setUp(self):
Expand All @@ -46,11 +50,29 @@ def setUp(self):
self.bigram_utf8 = Phrases(sentences, min_count=1, threshold=1)
self.bigram_unicode = Phrases(unicode_sentences, min_count=1, threshold=1)

def testEmptyInputsOnBigramConstruction(self):
    """Empty inputs must be transformed without errors into the matching empty results."""
    empty_cases = (
        # Empty list -> empty list
        ([], []),
        # Empty iterator -> empty list
        (iter(()), []),
        # List of empty list -> list of empty list
        ([[], []], [[], []]),
        # Iterator of empty list -> list of empty list
        (iter([[], []]), [[], []]),
        # Iterator of empty iterator -> list of empty list
        ((iter(()) for i in range(2)), [[], []]),
    )
    for given, expected in empty_cases:
        self.assertEqual(list(self.bigram_default[given]), expected)

def testSentenceGeneration(self):
    """Check that transforming the dummy corpus keeps the number of sentences."""
    # one output sentence is expected for every input sentence
    expected_count = len(sentences)
    transformed = list(self.bigram_default[sentences])
    self.assertEqual(expected_count, len(transformed))

def testSentenceGenerationWithGenerator(self):
    """Check that a generator corpus yields one output sentence per input sentence."""
    # a fresh generator is needed for each pass, since generators are one-shot
    expected_count = len(list(gen_sentences()))
    transformed = list(self.bigram_default[gen_sentences()])
    self.assertEqual(expected_count, len(transformed))

def testBigramConstruction(self):
"""Test Phrases bigram construction building."""
# with this setting we should get response_time and graph_minors
Expand All @@ -75,6 +97,20 @@ def testBigramConstruction(self):
self.assertTrue(u'graph_minors' in self.bigram[sentences[-1]])
self.assertTrue(u'human_interface' in self.bigram[sentences[-1]])

def testBigramConstructionFromGenerator(self):
    """Check that both expected bigrams are produced when the corpus is a generator."""
    found_response_time = False
    found_graph_minors = False

    for phrased in self.bigram[gen_sentences()]:
        # `or` short-circuits, so a bigram that was already found is not searched for again
        found_response_time = found_response_time or 'response_time' in phrased
        found_graph_minors = found_graph_minors or 'graph_minors' in phrased
        if found_response_time and found_graph_minors:
            # both bigrams seen, no need to consume the rest of the stream
            break
    self.assertTrue(found_response_time and found_graph_minors)

def testEncoding(self):
"""Test that both utf8 and unicode input work; output must be unicode."""
expected = [u'survey', u'user', u'computer', u'system', u'response_time']
Expand Down

0 comments on commit 34759be

Please sign in to comment.