Phrases and Phraser allow a generator corpus (#1099)

* Phrases and Phraser allow a generator corpus
  Allow Phrases and Phraser models to take a generator function/expression as input to the transformation method. Previously, only indexable iterables could be used, which is problematic for large corpora. Add additional tests to test_phrases using a generator as input.
* Update and simplify _is_single predicate
  The _is_single function in phrases.py is now simpler and has the same contract as the is_corpus function in utils.
* Fixes use of next in phrases, and adds tests for empty input to phrases models
piskvorky · Jan 27, 2017 · 34759be · 34759be
1 parent 3cb735b
commit 34759be
Show file tree

Hide file tree

Showing 2 changed files with 65 additions and 10 deletions.
diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
@@ -62,14 +62,39 @@
 import logging
 import warnings
 from collections import defaultdict
+import itertools as it
 
-from six import iteritems, string_types
+from six import iteritems, string_types, next
 
 from gensim import utils, interfaces
 
 logger = logging.getLogger(__name__)
 
 
+def _is_single(obj):
+    """
+    Check whether `obj` is a single document or an entire corpus.
+    Returns (is_single, new) 2-tuple, where `new` yields the same
+    sequence as `obj`.
+
+    `obj` is a single document if it is an iterable of strings.  It
+    is a corpus if it is an iterable of documents.
+    """
+    obj_iter = iter(obj)
+    try:
+        peek = next(obj_iter)
+        obj_iter = it.chain([peek], obj_iter)
+    except StopIteration:
+        # An empty object is a single document
+        return True, obj
+    if isinstance(peek, string_types):
+        # It's a document, return the iterator
+        return True, obj_iter
+    else:
+        # If the first item isn't a string, assume obj is a corpus
+        return False, obj_iter
+
+
 class Phrases(interfaces.TransformationABC):
     """
     Detect phrases, based on collected collocation counts. Adjacent words that appear
@@ -246,10 +271,8 @@ def __getitem__(self, sentence):
 
         """
         warnings.warn("For a faster implementation, use the gensim.models.phrases.Phraser class")
-        try:
-            is_single = not sentence or isinstance(sentence[0], string_types)
-        except:
-            is_single = False
+
+        is_single, sentence = _is_single(sentence)
         if not is_single:
             # if the input is an entire corpus (rather than a single sentence),
             # return an iterable stream.
@@ -327,7 +350,6 @@ def __init__(self, phrases_model):
                 logger.info('Phraser added %i phrasegrams', count)
         logger.info('Phraser built with %i %i phrasegrams', count, len(self.phrasegrams))
 
-
     def __getitem__(self, sentence):
         """
         Convert the input tokens `sentence` (=list of unicode strings) into phrase
@@ -339,10 +361,7 @@ def __getitem__(self, sentence):
         into phrases on the fly, one after another.
 
         """
-        try:
-            is_single = not sentence or isinstance(sentence[0], string_types)
-        except:
-            is_single = False
+        is_single, sentence = _is_single(sentence)
         if not is_single:
             # if the input is an entire corpus (rather than a single sentence),
             # return an iterable stream.

diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py
@@ -38,6 +38,10 @@
 unicode_sentences = [[utils.to_unicode(w) for w in sentence] for sentence in sentences]
 
 
+def gen_sentences():
+    return ((w for w in sentence) for sentence in sentences)
+
+
 class TestPhrasesCommon(unittest.TestCase):
     """ Tests that need to be run for both Prases and Phraser classes."""
     def setUp(self):
@@ -46,11 +50,29 @@ def setUp(self):
         self.bigram_utf8 = Phrases(sentences, min_count=1, threshold=1)
         self.bigram_unicode = Phrases(unicode_sentences, min_count=1, threshold=1)
 
+    def testEmptyInputsOnBigramConstruction(self):
+        """Test that empty inputs don't throw errors and return the expected result."""
+        # Empty list -> empty list
+        self.assertEqual(list(self.bigram_default[[]]), [])
+        # Empty iterator -> empty list
+        self.assertEqual(list(self.bigram_default[iter(())]), [])
+        # List of empty list -> list of empty list
+        self.assertEqual(list(self.bigram_default[[[], []]]), [[], []])
+        # Iterator of empty list -> list of empty list
+        self.assertEqual(list(self.bigram_default[iter([[], []])]), [[], []])
+        # Iterator of empty iterator -> list of empty list
+        self.assertEqual(list(self.bigram_default[(iter(()) for i in range(2))]), [[], []])
+
     def testSentenceGeneration(self):
         """Test basic bigram using a dummy corpus."""
         # test that we generate the same amount of sentences as the input
         self.assertEqual(len(sentences), len(list(self.bigram_default[sentences])))
 
+    def testSentenceGenerationWithGenerator(self):
+        """Test basic bigram production when corpus is a generator."""
+        self.assertEqual(len(list(gen_sentences())),
+                         len(list(self.bigram_default[gen_sentences()])))
+
     def testBigramConstruction(self):
         """Test Phrases bigram construction building."""
         # with this setting we should get response_time and graph_minors
@@ -75,6 +97,20 @@ def testBigramConstruction(self):
         self.assertTrue(u'graph_minors' in self.bigram[sentences[-1]])
         self.assertTrue(u'human_interface' in self.bigram[sentences[-1]])
 
+    def testBigramConstructionFromGenerator(self):
+        """Test Phrases bigram construction building when corpus is a generator"""
+        bigram1_seen = False
+        bigram2_seen = False
+
+        for s in self.bigram[gen_sentences()]:
+            if not bigram1_seen and 'response_time' in s:
+                bigram1_seen = True
+            if not bigram2_seen and 'graph_minors' in s:
+                bigram2_seen = True
+            if bigram1_seen and bigram2_seen:
+                break
+        self.assertTrue(bigram1_seen and bigram2_seen)
+
     def testEncoding(self):
         """Test that both utf8 and unicode input work; output must be unicode."""
         expected = [u'survey', u'user', u'computer', u'system', u'response_time']