Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce Phraser memory usage (drop frequencies) #2208

Merged
merged 12 commits into from
Jan 11, 2019
11 changes: 9 additions & 2 deletions gensim/models/phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,13 @@ def load(cls, *args, **kwargs):
"""
model = super(PhrasesTransformation, cls).load(*args, **kwargs)
# update older models
# if value in phrasegrams dict is a tuple, load only the scores.

for component, score in getattr(model, "phrasegrams", {}).items():
if isinstance(score, tuple):
frequency, score_val = score
model.phrasegrams[component] = score_val

# if no scoring parameter, use default scoring
if not hasattr(model, 'scoring'):
logger.info('older version of %s loaded without scoring function', cls.__name__)
Expand Down Expand Up @@ -815,7 +822,7 @@ def __init__(self, phrases_model):
for bigram, score in phrases_model.export_phrases(corpus, self.delimiter, as_tuples=True):
if bigram in self.phrasegrams:
logger.info('Phraser repeat %s', bigram)
self.phrasegrams[bigram] = (phrases_model.vocab[self.delimiter.join(bigram)], score)
self.phrasegrams[bigram] = score
count += 1
if not count % 50000:
logger.info('Phraser added %i phrasegrams', count)
Expand Down Expand Up @@ -858,7 +865,7 @@ def score_item(self, worda, wordb, components, scorer):

"""
try:
return self.phrasegrams[tuple(components)][1]
return self.phrasegrams[tuple(components)]
except KeyError:
return -1

Expand Down
Binary file added gensim/test/test_data/phraser-3.6.0.model
Binary file not shown.
Binary file added gensim/test/test_data/phrases-3.6.0.model
Binary file not shown.
17 changes: 16 additions & 1 deletion gensim/test/test_phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
import unittest

import six

import numpy as np

from gensim.utils import to_unicode
Expand Down Expand Up @@ -646,6 +645,22 @@ def testEncoding(self):
self.assertTrue(isinstance(transformed, six.text_type))


class TestPhraserModelCompatibilty(unittest.TestCase):
    """Backward-compatibility check against the stored ``*-3.6.0.model`` fixtures."""

    def testCompatibilty(self):
        # Load both previously saved models from the test data directory.
        saved_phraser = Phraser.load(datapath("phraser-3.6.0.model"))
        saved_phrases = Phrases.load(datapath("phrases-3.6.0.model"))

        # A single tokenized sentence; 'graph' and 'minors' are expected to be
        # joined into one phrase token by both models.
        tokens = ['trees', 'graph', 'minors']
        expected_tokens = ['trees', 'graph_minors']

        # Transform with both models first, then compare each result.
        phraser_tokens = saved_phraser[tokens]
        phrases_tokens = saved_phrases[tokens]

        self.assertEqual(phraser_tokens, expected_tokens)
        self.assertEqual(phrases_tokens, expected_tokens)


if __name__ == '__main__':
    # When run as a script, emit timestamped DEBUG-level log output and
    # hand control to the unittest runner.
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s : %(levelname)s : %(message)s',
    )
    unittest.main()