piskvorky · prakhar2b · Jun 14, 2017 · Jun 14, 2017 · Jun 21, 2017 · Jun 21, 2017
diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
@@ -169,7 +169,9 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000):
             if sentence_no % progress_per == 0:
                 logger.info("PROGRESS: at sentence #%i, processed %i words and %i word types" %
                             (sentence_no, total_words, len(vocab)))
-            sentence = [utils.any2utf8(w) for w in sentence]
+
+            sentence = [w for w in (utils.any2utf8(u'_'.join(sentence)).split('_'))]
+
             for bigram in zip(sentence, sentence[1:]):
                 vocab[bigram[0]] += 1
                 vocab[delimiter.join(bigram)] += 1
@@ -227,7 +229,7 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False):
             then you can debug the threshold with generated tsv
         """
         for sentence in sentences:
-            s = [utils.any2utf8(w) for w in sentence]
+            s = [w for w in (utils.any2utf8(u'_'.join(sentence)).split('_'))]
             last_bigram = False
             vocab = self.vocab
             threshold = self.threshold

diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py
@@ -165,6 +165,7 @@ def testPruning(self):
         """Test that max_vocab_size parameter is respected."""
         bigram = Phrases(sentences, max_vocab_size=5)
         self.assertTrue(len(bigram.vocab) <= 5)
+
 #endclass TestPhrasesModel