diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 9d8a5f5da6..247f5532a9 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -805,7 +805,7 @@ def __init__(self, phrases_model): for bigram, score in phrases_model.export_phrases(corpus, self.delimiter, as_tuples=True): if bigram in self.phrasegrams: logger.info('Phraser repeat %s', bigram) - self.phrasegrams[bigram] = (phrases_model.vocab[self.delimiter.join(bigram)], score) + self.phrasegrams[bigram] = (None, score) count += 1 if not count % 50000: logger.info('Phraser added %i phrasegrams', count) @@ -848,7 +848,7 @@ def score_item(self, worda, wordb, components, scorer): """ try: - return self.phrasegrams[tuple(components)][1] + return self.phrasegrams[tuple(components)][-1] except KeyError: return -1 diff --git a/gensim/test/test_data/phraser_model_3dot6 b/gensim/test/test_data/phraser_model_3dot6 new file mode 100644 index 0000000000..ea6e264374 Binary files /dev/null and b/gensim/test/test_data/phraser_model_3dot6 differ diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index e83bf5b2b9..717a8dda25 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -646,6 +646,16 @@ def testEncoding(self): self.assertTrue(isinstance(transformed, six.text_type)) +class TestPhraserModelCompatibilty(unittest.TestCase): + + def testCompatibilty(self): + bigram_loaded = Phraser.load(datapath("phraser_model_3dot6")) + test_sentences = [u'trees', u'graph', u'minors'] + prev_ver = bigram_loaded[test_sentences] + expected_res = ['trees_graph', 'minors'] + self.assertEqual(prev_ver, expected_res) + + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main()