From 16e988f3a146d2d6a4c53642f420136dd3709f76 Mon Sep 17 00:00:00 2001 From: Rob Malouf Date: Mon, 19 Mar 2018 18:55:37 -0700 Subject: [PATCH 1/3] fix _is_single element-wise comparison error --- gensim/models/phrases.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 gensim/models/phrases.py diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py old mode 100644 new mode 100755 index 30a8913745..1628a80460 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -118,7 +118,7 @@ def _is_single(obj): if isinstance(peek, string_types): # It's a document, return the iterator return True, obj_iter - if temp_iter == obj: + if temp_iter is obj: # Checking for iterator to the object return False, obj_iter else: From f97910e2a214d3941712def1b7d133eb99481266 Mon Sep 17 00:00:00 2001 From: Rob Malouf Date: Tue, 20 Mar 2018 07:59:36 -0700 Subject: [PATCH 2/3] fix mode --- gensim/models/phrases.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 gensim/models/phrases.py diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py old mode 100755 new mode 100644 From 4854bc5da137b515c2f07e2c4b30c11216694e7c Mon Sep 17 00:00:00 2001 From: Rob Malouf Date: Tue, 20 Mar 2018 08:20:27 -0700 Subject: [PATCH 3/3] add test for corpus as numpy array --- gensim/test/test_phrases.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index 58d0cfff93..e83bf5b2b9 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -13,6 +13,8 @@ import six +import numpy as np + from gensim.utils import to_unicode from gensim.models.phrases import SentenceAnalyzer, Phrases, Phraser from gensim.models.phrases import pseudocorpus, original_scorer @@ -148,7 +150,7 @@ def gen_sentences(self): class PhrasesCommon: - """ Tests that need to be run for both Prases and Phraser classes.""" + """ Tests that need to be run for both Phrases and Phraser classes.""" def setUp(self): self.bigram = Phrases( @@ -230,6 +232,20 @@ def testBigramConstructionFromGenerator(self): break self.assertTrue(bigram1_seen and bigram2_seen) + def testBigramConstructionFromArray(self): + """Test Phrases bigram construction building when corpus is a numpy array""" + bigram1_seen = False + bigram2_seen = False + + for s in self.bigram[np.array(self.sentences)]: + if not bigram1_seen and self.bigram1 in s: + bigram1_seen = True + if not bigram2_seen and self.bigram2 in s: + bigram2_seen = True + if bigram1_seen and bigram2_seen: + break + self.assertTrue(bigram1_seen and bigram2_seen) + def testEncoding(self): """Test that both utf8 and unicode input work; output must be unicode.""" expected = [u'survey', u'user', u'computer', u'system', u'response_time']