From 242c80ede3ba60152da7eee21703b48657cb260e Mon Sep 17 00:00:00 2001 From: jeni Shah Date: Sat, 20 Oct 2018 18:52:10 +0530 Subject: [PATCH] fix phraser memory --- gensim/models/phrases.py | 4 ++-- gensim/test/test_data/phraser_model_3dot6 | Bin 0 -> 543 bytes gensim/test/test_phrases.py | 10 ++++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) create mode 100644 gensim/test/test_data/phraser_model_3dot6 diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 9d8a5f5da6..247f5532a9 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -805,7 +805,7 @@ def __init__(self, phrases_model): for bigram, score in phrases_model.export_phrases(corpus, self.delimiter, as_tuples=True): if bigram in self.phrasegrams: logger.info('Phraser repeat %s', bigram) - self.phrasegrams[bigram] = (phrases_model.vocab[self.delimiter.join(bigram)], score) + self.phrasegrams[bigram] = (None, score) count += 1 if not count % 50000: logger.info('Phraser added %i phrasegrams', count) @@ -848,7 +848,7 @@ def score_item(self, worda, wordb, components, scorer): """ try: - return self.phrasegrams[tuple(components)][1] + return self.phrasegrams[tuple(components)][-1] except KeyError: return -1 diff --git a/gensim/test/test_data/phraser_model_3dot6 b/gensim/test/test_data/phraser_model_3dot6 new file mode 100644 index 0000000000000000000000000000000000000000..ea6e2643744ebc22c39c2d744eac07203da18b2c GIT binary patch literal 543 zcmZ{g%We}f6o!*Dmu6ZZ6aoc$0b1G=XxhRHuw#)8nw2*rYvQ;wmd18|d!|w=RicZ| z!|)=!7RS?Uy1=qz>pTB{ZofMslg2AO)pj8ZKV9Z7_0mVbn%6~Wo!p^)i_U}%Tdh`^ zJLz*s zJiEsz#&fnoriWQGmy&1e7B41wpLL0sN=v-VsjYF0S1VX~8t8R^+cDnMcsuL-{IUI~ zIW^vqsu!$IN{^i&+l{feqEup;V?WUMF+SAz7|bL5-Jx6K;F0Vw$Uen5A{v%N3%1VV zIN(W)lNz5NmHtw(b&_#zD!sh+_?qB!MZ8dfOcR{3sL`m5lF5a~d4dZzZkUsza$a3Y R?$aw-*p$NG5`3@b(|^@5sb&BG literal 0 HcmV?d00001 diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index e83bf5b2b9..717a8dda25 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -646,6 +646,16 @@ def testEncoding(self): self.assertTrue(isinstance(transformed, six.text_type)) +class TestPhraserModelCompatibilty(unittest.TestCase): + + def testCompatibilty(self): + bigram_loaded = Phraser.load(datapath("phraser_model_3dot6")) + test_sentences = [u'trees', u'graph', u'minors'] + prev_ver = bigram_loaded[test_sentences] + expected_res = ['trees_graph', 'minors'] + self.assertEqual(prev_ver, expected_res) + + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main()